From f07ccd6e4fbc5bbfeb94d40e0f14bc527a7d5439 Mon Sep 17 00:00:00 2001 From: Manuel Woelker Date: Thu, 26 Jan 2017 20:28:23 +0100 Subject: [PATCH] Squashed 'cpp/simdcomp/' content from commit 0dca286 git-subtree-dir: cpp/simdcomp git-subtree-split: 0dca28668f1fb6d343dc3c62fa7750a00f1d7201 --- .gitignore | 9 + .travis.yml | 11 + CHANGELOG | 9 + LICENSE | 27 + README.md | 137 + benchmarks/benchmark.c | 235 + benchmarks/bitpackingbenchmark.c | 205 + example.c | 195 + go/README.md | 13 + go/test.go | 71 + include/avxbitpacking.h | 40 + include/portability.h | 81 + include/simdbitpacking.h | 72 + include/simdcomp.h | 22 + include/simdcomputil.h | 54 + include/simdfor.h | 72 + include/simdintegratedbitpacking.h | 98 + makefile | 79 + makefile.vc | 104 + package.json | 16 + scripts/avxpacking.py | 182 + scripts/simdfor.py | 152 + simdcomp.def | 40 + src/avxbitpacking.c | 7795 +++++++++ src/simdbitpacking.c | 14183 +++++++++++++++ src/simdcomputil.c | 234 + src/simdfor.c | 14501 ++++++++++++++++ src/simdintegratedbitpacking.c | 24882 +++++++++++++++++++++++++++ src/simdpackedsearch.c | 15866 +++++++++++++++++ src/simdpackedselect.c | 15490 +++++++++++++++++ tests/unit.c | 900 + tests/unit_chars.c | 102 + 32 files changed, 95877 insertions(+) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 CHANGELOG create mode 100644 LICENSE create mode 100644 README.md create mode 100644 benchmarks/benchmark.c create mode 100644 benchmarks/bitpackingbenchmark.c create mode 100644 example.c create mode 100644 go/README.md create mode 100644 go/test.go create mode 100644 include/avxbitpacking.h create mode 100644 include/portability.h create mode 100644 include/simdbitpacking.h create mode 100644 include/simdcomp.h create mode 100644 include/simdcomputil.h create mode 100644 include/simdfor.h create mode 100644 include/simdintegratedbitpacking.h create mode 100644 makefile create mode 100644 makefile.vc create mode 100644 package.json create mode 100755 
scripts/avxpacking.py create mode 100755 scripts/simdfor.py create mode 100644 simdcomp.def create mode 100644 src/avxbitpacking.c create mode 100644 src/simdbitpacking.c create mode 100644 src/simdcomputil.c create mode 100644 src/simdfor.c create mode 100644 src/simdintegratedbitpacking.c create mode 100644 src/simdpackedsearch.c create mode 100644 src/simdpackedselect.c create mode 100644 tests/unit.c create mode 100644 tests/unit_chars.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..b8334280c --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +Makefile.in +lib* +unit* +*.o +src/*.lo +src/*.o +src/.deps +src/.dirstamp +src/.libs diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..28f13df7a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,11 @@ +language: c +sudo: false +compiler: + - gcc + - clang + +branches: + only: + - master + +script: make && ./unit diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 000000000..bf56786f4 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,9 @@ +Upcoming + - added missing include + - improved portability (MSVC) + - implemented C89 compatibility +Version 0.0.3 (19 May 2014) + - improved documentation +Version 0.0.2 (6 February 2014) + - added go demo +Version 0.0.1 (5 February 2014) diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..f3c5904df --- /dev/null +++ b/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2014--, The authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +* Neither the name of the {organization} nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 000000000..c8c45d1e2 --- /dev/null +++ b/README.md @@ -0,0 +1,137 @@ +The SIMDComp library +==================== +[![Build Status](https://travis-ci.org/lemire/simdcomp.png)](https://travis-ci.org/lemire/simdcomp) + +A simple C library for compressing lists of integers using binary packing and SIMD instructions. +The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers. + +This library can decode at least 4 billions of compressed integers per second on most +desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s. +This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4. 
+ +On a Skylake Intel processor, it can decode integers at a rate 0.3 cycles per integer, +which can easily translate into more than 8 decoded billions integers per second. + +Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others + +What is it for? +------------- + +This is a low-level library for fast integer compression. By design it does not define a compressed +format. It is up to the (sophisticated) user to create a compressed format. + +Requirements +------------- + +- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.) +- It is possible to build the core part of the code if your processor support SSE2 (Pentium4 or better) +- C99 compliant compiler (GCC is assumed) +- A Linux-like distribution is assumed by the makefile + +For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker + +Usage +------- + +Compression works over blocks of 128 integers. + +For a complete working example, see example.c (you can build it and +run it with "make example; ./example"). + + + +1) Lists of integers in random order. + +```C +const uint32_t b = maxbits(datain);// computes bit width +simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*32 bytes +simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer +``` + +While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b. + +2) Sorted lists of integers. + +We used differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset). 
+ +```C +uint32_t offset = 0; +uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width +simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*32 bytes +simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed +``` + +General example for arrays of arbitrary length: +```C +int compress_decompress_demo() { + size_t k, N = 9999; + __m128i * endofbuf; + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint8_t * buffer; + uint32_t * backbuffer = malloc(N * sizeof(uint32_t)); + uint32_t b; + + for (k = 0; k < N; ++k){ /* start with k=0, not k=1! */ + datain[k] = k; + } + + b = maxbits_length(datain, N); + buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory + endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b); + /* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */ + /* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */ + simdunpack_length((const __m128i *)buffer, N, backbuffer, b); + + for (k = 0; k < N; ++k){ + if(datain[k] != backbuffer[k]) { + printf("bug\n"); + return -1; + } + } + return 0; +} +``` + + +3) Frame-of-Reference + +We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing +routines, but do not use differential coding so they allow faster search in some cases, at the expense +of compression. + +Setup +--------- + + +make +make test + +and if you are daring: + +make install + +Go +-------- + +If you are a go user, there is a "go" folder where you will find a simple demo. 
+ +Other libraries +---------------- + +* Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte +* Fast integer compression in C using StreamVByte https://github.com/lemire/streamvbyte +* FastPFOR is a C++ research library well suited to compress unsorted arrays: https://github.com/lemire/FastPFor +* SIMDCompressionAndIntersection is a C++ research library well suited for sorted arrays (differential coding) +and computing intersections: https://github.com/lemire/SIMDCompressionAndIntersection +* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor +* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch + + +References +------------ + +* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399 +* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015. http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract +* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387 +* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916 +* T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. 
http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5 diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c new file mode 100644 index 000000000..783ef9485 --- /dev/null +++ b/benchmarks/benchmark.c @@ -0,0 +1,235 @@ +/** + * This code is released under a BSD License. + */ +#include +#include +#include +#include + +#include "simdcomp.h" + +#ifdef _MSC_VER +# include + +__int64 freq; + +typedef __int64 time_snap_t; + +static time_snap_t time_snap(void) +{ + __int64 now; + + QueryPerformanceCounter((LARGE_INTEGER *)&now); + + return (__int64)((now*1000000)/freq); +} +# define TIME_SNAP_FMT "%I64d" +#else +# define time_snap clock +# define TIME_SNAP_FMT "%lu" +typedef clock_t time_snap_t; +#endif + + +void benchmarkSelect() { + uint32_t buffer[128]; + uint32_t backbuffer[128]; + uint32_t initial = 33; + uint32_t b; + time_snap_t S1, S2, S3; + int i; + printf("benchmarking select \n"); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = initial; + uint32_t out[128]; + /* initialize the buffer */ + for (i = 0; i < 128; i++) { + buffer[i] = ((uint32_t)(1655765 * i )) ; + if(b < 32) buffer[i] %= (1< *ib) + return 1; + return 0; +} + +/* adapted from wikipedia */ +int binary_search(uint32_t * A, uint32_t key, int imin, int imax) +{ + int imid; + imax --; + while(imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] > key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; + } else { + return imid; + } + } + return imax; +} + + +/* adapted from wikipedia */ +int lower_bound(uint32_t * A, uint32_t key, int imin, int imax) +{ + int imid; + imax --; + while(imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] >= key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; + } + } + if(A[imin] >= key) return imin; + return imax; +} + +void benchmarkSearch() { + uint32_t buffer[128]; + 
uint32_t backbuffer[128]; + uint32_t out[128]; + uint32_t result, initial = 0; + uint32_t b, i; + time_snap_t S1, S2, S3, S4; + + printf("benchmarking search \n"); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = initial; + /* initialize the buffer */ + for (i = 0; i < 128; i++) { + buffer[i] = ((uint32_t)rand()) ; + if(b < 32) buffer[i] %= (1< 0) { + if(buffer[pos-1] >= pseudorandomkey) + printf("bug B.\n"); + } + } + S2 = time_snap(); + for (i = 0; i < 128 * 10; i++) { + int pos; + uint32_t pseudorandomkey = buffer[i%128]; + simdunpackd1(initial, (__m128i *)out, backbuffer, b); + pos = lower_bound(backbuffer, pseudorandomkey, 0, 128); + result = backbuffer[pos]; + + if((result < pseudorandomkey) || (buffer[pos] != result)) { + printf("bug C.\n"); + } else if (pos > 0) { + if(buffer[pos-1] >= pseudorandomkey) + printf("bug D.\n"); + } + } + S3 = time_snap(); + for (i = 0; i < 128 * 10; i++) { + + int pos; + uint32_t pseudorandomkey = buffer[i%128]; + pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128, + pseudorandomkey, &result); + if((result < pseudorandomkey) || (buffer[pos] != result)) { + printf("bug A.\n"); + } else if (pos > 0) { + if(buffer[pos-1] >= pseudorandomkey) + printf("bug B.\n"); + } + } + S4 = time_snap(); + + printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2), (S4-S3) ); + } +} + + +int main() { +#ifdef _MSC_VER + QueryPerformanceFrequency((LARGE_INTEGER *)&freq); +#endif + benchmarkSearch(); + benchmarkSelect(); + return 0; +} diff --git a/benchmarks/bitpackingbenchmark.c b/benchmarks/bitpackingbenchmark.c new file mode 100644 index 000000000..4426d9897 --- /dev/null +++ b/benchmarks/bitpackingbenchmark.c @@ -0,0 +1,205 @@ +#include + +#include "simdcomp.h" + + +#define RDTSC_START(cycles) \ + do { 
\ + register unsigned cyc_high, cyc_low; \ + __asm volatile( \ + "cpuid\n\t" \ + "rdtsc\n\t" \ + "mov %%edx, %0\n\t" \ + "mov %%eax, %1\n\t" \ + : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + +#define RDTSC_FINAL(cycles) \ + do { \ + register unsigned cyc_high, cyc_low; \ + __asm volatile( \ + "rdtscp\n\t" \ + "mov %%edx, %0\n\t" \ + "mov %%eax, %1\n\t" \ + "cpuid\n\t" \ + : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + + + + +uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) { + uint32_t * answer = malloc(sizeof(uint32_t) * length); + uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1); + uint32_t i; + for(i = 0; i < length; ++i) { + answer[i] = rand() & mask; + } + return answer; +} + +uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) { + uint32_t * answer = malloc(sizeof(uint32_t) * length); + uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1); + uint32_t i; + answer[0] = rand() & mask; + for(i = 1; i < length; ++i) { + answer[i] = answer[i-1] + (rand() & mask); + } + return answer; +} + + +void demo128() { + const uint32_t length = 128; + uint32_t bit; + printf("# --- %s\n", __func__); + printf("# compressing %d integers\n",length); + printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); + for(bit = 1; bit <= 32; ++bit) { + uint32_t i; + + uint32_t * data = get_random_array_from_bit_width(length, bit); + __m128i * buffer = malloc(length * sizeof(uint32_t)); + uint32_t * backdata = malloc(length * sizeof(uint32_t)); + uint32_t repeat = 500; + uint64_t min_diff; + printf("%d\t",bit); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + 
simdpackwithoutmask(data,buffer, bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + simdunpack(buffer, backdata,bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + + free(data); + free(buffer); + free(backdata); + printf("\n"); + } + printf("\n\n"); /* two blank lines are required by gnuplot */ +} + +void demo128_d1() { + const uint32_t length = 128; + uint32_t bit; + printf("# --- %s\n", __func__); + printf("# compressing %d integers\n",length); + printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); + for(bit = 1; bit <= 32; ++bit) { + uint32_t i; + + uint32_t * data = get_random_array_from_bit_width_d1(length, bit); + __m128i * buffer = malloc(length * sizeof(uint32_t)); + uint32_t * backdata = malloc(length * sizeof(uint32_t)); + uint32_t repeat = 500; + uint64_t min_diff; + printf("%d\t",bit); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + simdpackwithoutmaskd1(0,data,buffer, bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + simdunpackd1(0,buffer, backdata,bit); + RDTSC_FINAL(cycles_final); + 
cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + + free(data); + free(buffer); + free(backdata); + printf("\n"); + } + printf("\n\n"); /* two blank lines are required by gnuplot */ +} + +#ifdef __AVX2__ +void demo256() { + const uint32_t length = 256; + uint32_t bit; + printf("# --- %s\n", __func__); + printf("# compressing %d integers\n",length); + printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); + for(bit = 1; bit <= 32; ++bit) { + uint32_t i; + + uint32_t * data = get_random_array_from_bit_width(length, bit); + __m256i * buffer = malloc(length * sizeof(uint32_t)); + uint32_t * backdata = malloc(length * sizeof(uint32_t)); + uint32_t repeat = 500; + uint64_t min_diff; + printf("%d\t",bit); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + avxpackwithoutmask(data,buffer, bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + min_diff = (uint64_t)-1; + for (i = 0; i < repeat; i++) { + uint64_t cycles_start, cycles_final, cycles_diff; + __asm volatile("" ::: /* pretend to clobber */ "memory"); + RDTSC_START(cycles_start); + avxunpack(buffer, backdata,bit); + RDTSC_FINAL(cycles_final); + cycles_diff = (cycles_final - cycles_start); + if (cycles_diff < min_diff) min_diff = cycles_diff; + } + printf("%.2f\t",min_diff*1.0/length); + + free(data); + free(buffer); + free(backdata); + printf("\n"); + } + printf("\n\n"); /* two blank lines are required by gnuplot */ +} +#endif /* avx 2 */ + + +int main() { + demo128(); + demo128_d1(); +#ifdef __AVX2__ + demo256(); +#endif + return 0; + + +} diff --git a/example.c b/example.c new file mode 100644 index 000000000..1d68f95a7 
--- /dev/null +++ b/example.c @@ -0,0 +1,195 @@ +/* Type "make example" to build this example program. */ +#include +#include +#include +#include "simdcomp.h" + +/** +We provide several different code examples. +**/ + + +/* very simple test to illustrate a simple application */ +int compress_decompress_demo() { + size_t k, N = 9999; + __m128i * endofbuf; + int howmanybytes; + float compratio; + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint8_t * buffer; + uint32_t * backbuffer = malloc(N * sizeof(uint32_t)); + uint32_t b; + printf("== simple test\n"); + + for (k = 0; k < N; ++k) { /* start with k=0, not k=1! */ + datain[k] = k; + } + + b = maxbits_length(datain, N); + buffer = malloc(simdpack_compressedbytes(N,b)); + endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b); + howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */ + compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes; + /* endofbuf points to the end of the compressed data */ + buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); /* optional but safe. */ + printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio); + /* in actual applications b must be stored and retrieved: caller is responsible for that. 
*/ + simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */ + + for (k = 0; k < N; ++k) { + if(datain[k] != backbuffer[k]) { + printf("bug at %lu \n",(unsigned long)k); + return -1; + } + } + printf("Code works!\n"); + free(datain); + free(buffer); + free(backbuffer); + return 0; +} + + + +/* compresses data from datain to buffer, returns how many bytes written +used below in simple_demo */ +size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) { + uint32_t offset; + uint8_t * initout; + size_t k; + if(length/SIMDBlockSize*SIMDBlockSize != length) { + printf("Data length should be a multiple of %i \n",SIMDBlockSize); + } + offset = 0; + initout = buffer; + for(k = 0; k < length / SIMDBlockSize; ++k) { + uint32_t b = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + *buffer++ = b; + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer, + b); + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + buffer += b * sizeof(__m128i); + } + return buffer - initout; +} + +/* Another illustration ... 
*/ +void simple_demo() { + size_t REPEAT = 10, gap; + size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */ + uint32_t * datain = malloc(N * sizeof(uint32_t)); + size_t compsize; + clock_t start, end; + uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */ + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + printf("== simple demo\n"); + for (gap = 1; gap <= 243; gap *= 3) { + size_t k, repeat; + uint32_t offset = 0; + uint32_t bogus = 0; + double numberofseconds; + + printf("\n"); + printf(" gap = %lu \n", (unsigned long) gap); + datain[0] = 0; + for (k = 1; k < N; ++k) + datain[k] = datain[k-1] + ( rand() % (gap + 1) ); + compsize = compress(datain,N,buffer); + printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); + start = clock(); + for(repeat = 0; repeat < REPEAT; ++repeat) { + uint8_t * decbuffer = buffer; + for (k = 0; k * SIMDBlockSize < N; ++k) { + uint8_t b = *decbuffer++; + simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); + /* do something here with backbuffer */ + bogus += backbuffer[3]; + decbuffer += b * sizeof(__m128i); + offset = backbuffer[SIMDBlockSize - 1]; + } + } + end = clock(); + numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; + printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); + start = clock(); + for(repeat = 0; repeat < REPEAT; ++repeat) { + uint8_t * decbuffer = buffer; + for (k = 0; k * SIMDBlockSize < N; ++k) { + memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t)); + bogus += backbuffer[3] - backbuffer[100]; + } + } + end = clock(); + numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; + printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); + printf("ignore me %i \n",bogus); + printf("All tests are in CPU cache. 
Avoid out-of-cache decoding in applications.\n"); + } + free(buffer); + free(datain); + free(backbuffer); +} + +/* Used below in more_sophisticated_demo ... */ +size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) { + uint8_t * initout; + size_t k; + if(length/SIMDBlockSize*SIMDBlockSize != length) { + printf("Data length should be a multiple of %i \n",SIMDBlockSize); + } + initout = buffer; + for(k = 0; k < length / SIMDBlockSize; ++k) { + uint32_t b = maxbits(datain); + *buffer++ = b; + simdpackwithoutmask(datain, (__m128i *)buffer, b); + datain += SIMDBlockSize; + buffer += b * sizeof(__m128i); + } + return buffer - initout; +} + +/* Here we compress the data in blocks of 128 integers with varying bit width */ +int varying_bit_width_demo() { + size_t nn = 128 * 2; + uint32_t * datainn = malloc(nn * sizeof(uint32_t)); + uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize); + uint8_t * initbuffern = buffern; + uint32_t * backbuffern = malloc(nn * sizeof(uint32_t)); + size_t k, compsize; + printf("== varying bit-width demo\n"); + + for(k=0; k +*/ +import "C" +import "fmt" + +////////// +// For this demo, we pack and unpack blocks of 128 integers +///////// +func main() { + // I am going to use C types. Alternative might be to use unsafe.Pointer calls, see http://bit.ly/1ndw3W3 + // this is our original data + var data [128]C.uint32_t + for i := C.uint32_t(0); i < C.uint32_t(128); i++ { + data[i] = i + } + + + + + + //////////// + // We first pack without differential coding + /////////// + // computing how many bits per int. 
is needed + b := C.maxbits(&data[0]) + ratio := 32.0/float64(b) + fmt.Println("Bit width ", b) + fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio)) + // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits) + out := make([] C.__m128i,b) + C.simdpackwithoutmask( &data[0],&out[0],b); + var recovereddata [128]C.uint32_t + C.simdunpack(&out[0],&recovereddata[0],b) + for i := 0; i < 128; i++ { + if data[i] != recovereddata[i] { + fmt.Println("Bug ") + return + } + } + + /////////// + // Next, we use differential coding + ////////// + offset := C.uint32_t(0) // if you pack data from K to K + 128, offset should be the value at K-1. When K = 0, choose a default + b1 := C.simdmaxbitsd1(offset,&data[0]) + ratio1 := 32.0/float64(b1) + fmt.Println("Bit width ", b1) + fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio1)) + // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits) + out = make([] C.__m128i,b1) + C.simdpackwithoutmaskd1(offset, &data[0],&out[0],b1); + C.simdunpackd1(offset,&out[0],&recovereddata[0],b1) + for i := 0; i < 128; i++ { + if data[i] != recovereddata[i] { + fmt.Println("Bug ") + return + } + } + + fmt.Println("test succesful.") + +} diff --git a/include/avxbitpacking.h b/include/avxbitpacking.h new file mode 100644 index 000000000..00b68d622 --- /dev/null +++ b/include/avxbitpacking.h @@ -0,0 +1,40 @@ +/** + * This code is released under a BSD License. 
+ */ + +#ifndef INCLUDE_AVXBITPACKING_H_ +#define INCLUDE_AVXBITPACKING_H_ + + +#ifdef __AVX2__ + +#include "portability.h" + + +/* AVX2 is required */ +#include +/* for memset */ +#include + +#include "simdcomputil.h" + +enum{ AVXBlockSize = 256}; + +/* max integer logarithm over a range of AVXBlockSize integers (256 integer) */ +uint32_t avxmaxbits(const uint32_t * begin); + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpack(const uint32_t * in,__m256i * out, const uint32_t bit); + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpackwithoutmask(const uint32_t * in,__m256i * out, const uint32_t bit); + +/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */ +void avxunpack(const __m256i * in,uint32_t * out, const uint32_t bit); + + + + +#endif /* __AVX2__ */ + +#endif /* INCLUDE_AVXBITPACKING_H_ */ diff --git a/include/portability.h b/include/portability.h new file mode 100644 index 000000000..f68f179f1 --- /dev/null +++ b/include/portability.h @@ -0,0 +1,81 @@ +/** + * This code is released under a BSD License. 
+ */ +#ifndef SIMDBITCOMPAT_H_ +#define SIMDBITCOMPAT_H_ + +#include /* mostly for Microsoft compilers */ +#include + +#if SIMDCOMP_DEBUG +# define SIMDCOMP_ALWAYS_INLINE inline +# define SIMDCOMP_NEVER_INLINE +# define SIMDCOMP_PURE +#else +# if defined(__GNUC__) +# if __GNUC__ >= 3 +# define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline)) +# define SIMDCOMP_NEVER_INLINE __attribute__((noinline)) +# define SIMDCOMP_PURE __attribute__((pure)) +# else +# define SIMDCOMP_ALWAYS_INLINE inline +# define SIMDCOMP_NEVER_INLINE +# define SIMDCOMP_PURE +# endif +# elif defined(_MSC_VER) +# define SIMDCOMP_ALWAYS_INLINE __forceinline +# define SIMDCOMP_NEVER_INLINE +# define SIMDCOMP_PURE +# else +# if __has_attribute(always_inline) +# define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline)) +# else +# define SIMDCOMP_ALWAYS_INLINE inline +# endif +# if __has_attribute(noinline) +# define SIMDCOMP_NEVER_INLINE __attribute__((noinline)) +# else +# define SIMDCOMP_NEVER_INLINE +# endif +# if __has_attribute(pure) +# define SIMDCOMP_PURE __attribute__((pure)) +# else +# define SIMDCOMP_PURE +# endif +# endif +#endif + +#if defined(_MSC_VER) && _MSC_VER < 1600 +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; +#else +#include /* part of Visual Studio 2010 and better, others likely anyway */ +#endif + +#if defined(_MSC_VER) +#define SIMDCOMP_ALIGNED(x) __declspec(align(x)) +#else +#if defined(__GNUC__) +#define SIMDCOMP_ALIGNED(x) __attribute__ ((aligned(x))) +#endif +#endif + +#if defined(_MSC_VER) +# include +/* 64-bit needs extending */ +# define SIMDCOMP_CTZ(result, mask) do { \ + unsigned long index; \ + if (!_BitScanForward(&(index), (mask))) { \ + (result) = 32U; \ + } else { \ + (result) = (uint32_t)(index); \ + } \ + } while (0) +#else +# define SIMDCOMP_CTZ(result, mask) \ + result = __builtin_ctz(mask) +#endif + +#endif /* SIMDBITCOMPAT_H_ */ + diff --git a/include/simdbitpacking.h 
b/include/simdbitpacking.h new file mode 100644 index 000000000..af32fd617 --- /dev/null +++ b/include/simdbitpacking.h @@ -0,0 +1,72 @@ +/** + * This code is released under a BSD License. + */ +#ifndef SIMDBITPACKING_H_ +#define SIMDBITPACKING_H_ + +#include "portability.h" + +/* SSE2 is required */ +#include +/* for memset */ +#include + +#include "simdcomputil.h" + +/*** +* Please see example.c for various examples on how to make good use +* of these functions. +*/ + + + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out". + * The input values are masked so that only the least significant "bit" bits are used. */ +void simdpack(const uint32_t * in,__m128i * out, const uint32_t bit); + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out". + * The input values are assumed to be less than 1< + + + + +/* returns the integer logarithm of v (bit width) */ +uint32_t bits(const uint32_t v); + +/* max integer logarithm over a range of SIMDBlockSize integers (128 integer) */ +uint32_t maxbits(const uint32_t * begin); + +/* same as maxbits, but we specify the number of integers */ +uint32_t maxbits_length(const uint32_t * in,uint32_t length); + +enum{ SIMDBlockSize = 128}; + + +/* computes (quickly) the minimal value of 128 values */ +uint32_t simdmin(const uint32_t * in); + +/* computes (quickly) the minimal value of the specified number of values */ +uint32_t simdmin_length(const uint32_t * in, uint32_t length); + +#ifdef __SSE4_1__ +/* computes (quickly) the minimal and maximal value of the specified number of values */ +void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax); + +/* computes (quickly) the minimal and maximal value of the 128 values */ +void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax); + +#endif + +/* like maxbit over 128 integers (SIMDBlockSize) with provided initial value + and using differential coding */ +uint32_t simdmaxbitsd1(uint32_t initvalue, 
const uint32_t * in); + +/* like simdmaxbitsd1, but calculates maxbits over |length| integers + with provided initial value. |length| can be any arbitrary value. */ +uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in, + uint32_t length); + + + +#endif /* SIMDCOMPUTIL_H_ */ diff --git a/include/simdfor.h b/include/simdfor.h new file mode 100644 index 000000000..39f7fbcac --- /dev/null +++ b/include/simdfor.h @@ -0,0 +1,72 @@ +/** + * This code is released under a BSD License. + */ +#ifndef INCLUDE_SIMDFOR_H_ +#define INCLUDE_SIMDFOR_H_ + +#include "portability.h" + +/* SSE2 is required */ +#include + +#include "simdcomputil.h" +#include "simdbitpacking.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" */ +void simdpackFOR(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit); + + +/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */ +void simdunpackFOR(uint32_t initvalue, const __m128i * in,uint32_t * out, const uint32_t bit); + + +/* how many compressed bytes are needed to compressed length integers using a bit width of bit with +the simdpackFOR_length function. */ +int simdpackFOR_compressedbytes(int length, const uint32_t bit); + +/* like simdpackFOR, but supports an undetermined number of inputs. +This is useful if you need to pack less than 128 integers. Note that this function is much slower. + Compressed data is stored in the memory location between + the provided (out) pointer and the returned pointer. */ +__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t * in, int length, __m128i * out, const uint32_t bit); + +/* like simdunpackFOR, but supports an undetermined number of inputs. +This is useful if you need to unpack less than 128 integers. Note that this function is much slower. + The read compressed data is between the provided + (in) pointer and the returned pointer. 
*/ +const __m128i * simdunpackFOR_length(uint32_t initvalue, const __m128i * in, int length, uint32_t * out, const uint32_t bit); + + +/* returns the value stored at the specified "slot". +* */ +uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int slot); + +/* given a block of 128 packed values, this function sets the value at index "index" to "value" */ +void simdfastsetFOR(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index); + + +/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value + * which is >= |key|, and returns its position. It is assumed that the values + * stored are in sorted order. + * The encoded key is stored in "*presult". + * The first length decoded integers, ignoring others. If no value is larger or equal to the key, + * length is returned. Length should be no larger than 128. + * + * If no value is larger or equal to the key, +* length is returned */ +int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, uint32_t *presult); + +#ifdef __cplusplus +} // extern "C" +#endif + + + + +#endif /* INCLUDE_SIMDFOR_H_ */ diff --git a/include/simdintegratedbitpacking.h b/include/simdintegratedbitpacking.h new file mode 100644 index 000000000..92f44a23e --- /dev/null +++ b/include/simdintegratedbitpacking.h @@ -0,0 +1,98 @@ +/** + * This code is released under a BSD License. + */ + +#ifndef SIMD_INTEGRATED_BITPACKING_H +#define SIMD_INTEGRATED_BITPACKING_H + +#include "portability.h" + +/* SSE2 is required */ +#include + +#include "simdcomputil.h" +#include "simdbitpacking.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" + integer values should be in sorted order (for best results). + The differences are masked so that only the least significant "bit" bits are used. 
*/ +void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit); + + +/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" + integer values should be in sorted order (for best results). + The difference values are assumed to be less than 1<= |key|, and returns its position. It is assumed that the values + * stored are in sorted order. + * The encoded key is stored in "*presult". If no value is larger or equal to the key, +* 128 is returned. The pointer initOffset is a pointer to the last four value decoded +* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init)), +* and the vector gets updated. +**/ +int +simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit, + uint32_t key, uint32_t *presult); + + +/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value + * which is >= |key|, and returns its position. It is assumed that the values + * stored are in sorted order. + * The encoded key is stored in "*presult". + * The first length decoded integers, ignoring others. If no value is larger or equal to the key, + * length is returned. Length should be no larger than 128. + * + * If no value is larger or equal to the key, +* length is returned */ +int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, uint32_t *presult); + + + +/* returns the value stored at the specified "slot". +* */ +uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit, + int slot); + +/* given a block of 128 packed values, this function sets the value at index "index" to "value", + * you must somehow know the previous value. + * Because of differential coding, all following values are incremented by the offset between this new + * value and the old value... + * This functions is useful if you want to modify the last value. 
+ */ +void simdfastsetd1fromprevious( __m128i * in, uint32_t bit, uint32_t previousvalue, uint32_t value, size_t index); + +/* given a block of 128 packed values, this function sets the value at index "index" to "value", + * This function computes the previous value if needed. + * Because of differential coding, all following values are incremented by the offset between this new + * value and the old value... + * This functions is useful if you want to modify the last value. + */ +void simdfastsetd1(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index); + + +/*Simply scan the data +* The pointer initOffset is a pointer to the last four value decoded +* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init);), +* and the vector gets updated. +* */ + +void +simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/makefile b/makefile new file mode 100644 index 000000000..b022de413 --- /dev/null +++ b/makefile @@ -0,0 +1,79 @@ +# minimalist makefile +.SUFFIXES: +# +.SUFFIXES: .cpp .o .c .h +ifeq ($(DEBUG),1) +CFLAGS = -fPIC -std=c89 -ggdb -msse4.1 -march=native -Wall -Wextra -Wshadow -fsanitize=undefined -fno-omit-frame-pointer -fsanitize=address +else +CFLAGS = -fPIC -std=c89 -O3 -msse4.1 -march=native -Wall -Wextra -Wshadow +endif # debug +LDFLAGS = -shared +LIBNAME=libsimdcomp.so.0.0.3 +all: unit unit_chars bitpackingbenchmark $(LIBNAME) +test: + ./unit + ./unit_chars +install: $(OBJECTS) + cp $(LIBNAME) /usr/local/lib + ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so + ldconfig + cp $(HEADERS) /usr/local/include + + + +HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h + +uninstall: + for h in $(HEADERS) ; do rm /usr/local/$$h; done + rm /usr/local/lib/$(LIBNAME) + rm /usr/local/lib/libsimdcomp.so + 
ldconfig + + +OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \ + simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o + +$(LIBNAME): $(OBJECTS) + $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS) + + +avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude + + +simdfor.o: ./src/simdfor.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude + + +simdcomputil.o: ./src/simdcomputil.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude + +simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude + +simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude + +simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude + +simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude + +example: ./example.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS) + +unit: ./tests/unit.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS) + +bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude $(OBJECTS) +benchmark: ./benchmarks/benchmark.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude $(OBJECTS) +dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME) + $(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lsimdcomp + +unit_chars: ./tests/unit_chars.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude $(OBJECTS) +clean: + rm -f unit *.o $(LIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars diff --git a/makefile.vc b/makefile.vc new file mode 100644 index 000000000..cb631c272 --- /dev/null +++ b/makefile.vc @@ 
-0,0 +1,104 @@ + +!IFNDEF MACHINE +!IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64" +MACHINE=x64 +!ELSE +MACHINE=x86 +!ENDIF +!ENDIF + +!IFNDEF DEBUG +DEBUG=no +!ENDIF + +!IFNDEF CC +CC=cl.exe +!ENDIF + +!IFNDEF AR +AR=lib.exe +!ENDIF + +!IFNDEF LINK +LINK=link.exe +!ENDIF + +!IFNDEF PGO +PGO=no +!ENDIF + +!IFNDEF PGI +PGI=no +!ENDIF + +INC = /Iinclude + +!IF "$(DEBUG)"=="yes" +CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm +ARFLAGS = /nologo +LDFLAGS = /nologo /debug /nodefaultlib:msvcrt +!ELSE +CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP +ARFLAGS = /nologo /LTCG +LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf +!ENDIF + +!IF "$(PGI)"=="yes" +LDFLAGS = $(LDFLAGS) /ltcg:pgi +!ENDIF + +!IF "$(PGO)"=="yes" +LDFLAGS = $(LDFLAGS) /ltcg:pgo +!ENDIF + +LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \ + simdpackedsearch.obj simdpackedselect.obj simdfor.obj + + +all: lib dll dynunit unit_chars example benchmark +# need some good use case scenario to train the instrumented build + @if "$(PGI)"=="yes" echo Running PGO training + @if "$(PGI)"=="yes" benchmark.exe >nul 2>&1 + @if "$(PGI)"=="yes" example.exe >nul 2>&1 + + +$(LIB_OBJS): + $(CC) $(INC) $(CFLAGS) /c src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \ + src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c + +lib: $(LIB_OBJS) + $(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS) + +dll: $(LIB_OBJS) + $(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS) + +unit: lib + $(CC) $(INC) $(CFLAGS) /c src/unit.c + $(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib + +dynunit: dll + $(CC) $(INC) $(CFLAGS) /c src/unit.c + $(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib + +unit_chars: lib + $(CC) $(INC) $(CFLAGS) /c src/unit_chars.c + $(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp.lib + + +example: lib + $(CC) $(INC) $(CFLAGS) /c example.c + $(LINK) 
$(LDFLAGS) /OUT:example.exe example.obj simdcomp.lib + +benchmark: lib + $(CC) $(INC) $(CFLAGS) /c src/benchmark.c + $(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp.lib + +clean: + del /Q *.obj + del /Q *.lib + del /Q *.exe + del /Q *.dll + del /Q *.pgc + del /Q *.pgd + del /Q *.pdb + diff --git a/package.json b/package.json new file mode 100644 index 000000000..a91dd24a4 --- /dev/null +++ b/package.json @@ -0,0 +1,16 @@ +{ + "name": "simdcomp", + "version": "0.0.3", + "repo": "lemire/simdcomp", + "description": "A simple C library for compressing lists of integers", + "license": "BSD-3-Clause", + "src": [ + "src/simdbitpacking.c", + "src/simdcomputil.c", + "src/simdintegratedbitpacking.c", + "include/simdbitpacking.h", + "include/simdcomp.h", + "include/simdcomputil.h", + "include/simdintegratedbitpacking.h" + ] +} diff --git a/scripts/avxpacking.py b/scripts/avxpacking.py new file mode 100755 index 000000000..81d1ac097 --- /dev/null +++ b/scripts/avxpacking.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python +import sys +def howmany(bit): + """ how many values are we going to pack? 
""" + return 256 + +def howmanywords(bit): + return (howmany(bit) * bit + 255)/256 + +def howmanybytes(bit): + return howmanywords(bit) * 16 + +print(""" +/** code generated by avxpacking.py starts here **/ +""") + +print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""") +print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""") + + + + + + +def plurial(number): + if(number <> 1): + return "s" + else : + return "" + +print("") +print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {"); +print(" (void)compressed;"); +print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit)); + print(" const __m256i * in = (const __m256i *) pin;"); + print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m256i w0;") + else: + print(" __m256i w0, w1;") + if( (bit & (bit-1)) <> 0) : print(" __m256i tmp; /* used to store inputs at word boundary */") + oldword = 0 + for j in range(howmany(bit)/8): + firstword = j * bit / 32 + if(firstword > oldword): + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2)) + oldword = firstword + secondword = (j * bit + bit - 1)/32 + firstshift = (j*bit) % 32 + if( firstword == secondword): + if(firstshift == 0): + print(" w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j)) + else: + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift)) + else: + print(" tmp = _mm256_lddqu_si256 (in + 
{0});".format(j)) + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) + secondshift = 32-firstshift + print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2)) + print("}"); + print("") + + +print("") +print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {"); +print(" (void)compressed;"); +print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit)); + print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m256i w0;") + else: + print(" __m256i w0, w1;") + print(" const __m256i * in = (const __m256i *) pin;"); + if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1< 0) : print(" __m256i tmp; /* used to store inputs at word boundary */") + oldword = 0 + for j in range(howmany(bit)/8): + firstword = j * bit / 32 + if(firstword > oldword): + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2)) + oldword = firstword + secondword = (j * bit + bit - 1)/32 + firstshift = (j*bit) % 32 + loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j)) + if( firstword == secondword): + if(firstshift == 0): + print(" w{0} = {1};".format(firstword%2,loadstr)) + else: + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift)) + else: + print(" tmp = {0};".format(loadstr)) + print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp 
, {2}));".format(firstword%2,j,firstshift)) + secondshift = 32-firstshift + print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) + print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2)) + print("}"); + print("") + + +print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {"); +print(" (void) compressed;"); +print(" memset(pout,0,{0});".format(howmany(0))); +print("}"); +print("") + +for bit in range(1,33): + print("") + print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) + print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit)); + print(" /* we are going to access {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); + if(howmanywords(bit) == 1): + print(" __m256i w0;") + else: + print(" __m256i w0, w1;") + print(" __m256i * out = (__m256i *) pout;"); + if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1< oldword): + print(" w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword)) + oldword = secondword + firstshift = (j*bit) % 32 + firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") " + if(firstshift == 0): + firstshiftstr =" w{0} " # no need + wfirst = firstshiftstr.format(firstword%2) + if( firstword == secondword): + if(firstshift + bit <> 32): + wfirst = maskstr.format(wfirst) + print(" _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst)) + else: + secondshift = (32-firstshift) + wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift) + wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond) + wfirstorsecond = maskstr.format(wfirstorsecond) + print(" _mm256_storeu_si256(out + {0},\n {1});".format(j,wfirstorsecond)) + print("}"); + print("") + + +print("static avxpackblockfnc avxfuncPackArr[] = {") +for bit in 
range(0,32): + print("&avxpackblock{0},".format(bit)) +print("&avxpackblock32") +print("};") + +print("static avxpackblockfnc avxfuncPackMaskArr[] = {") +for bit in range(0,32): + print("&avxpackblockmask{0},".format(bit)) +print("&avxpackblockmask32") +print("};") + + +print("static avxunpackblockfnc avxfuncUnpackArr[] = {") +for bit in range(0,32): + print("&avxunpackblock{0},".format(bit)) +print("&avxunpackblock32") +print("};") +print("/** code generated by avxpacking.py ends here **/") diff --git a/scripts/simdfor.py b/scripts/simdfor.py new file mode 100755 index 000000000..c60db1e89 --- /dev/null +++ b/scripts/simdfor.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 + + +from math import ceil + +print(""" +/** +* Blablabla +* +*/ + +"""); + +def mask(bit): + return str((1 << bit) - 1) + +for length in [32]: + print(""" +static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { + __m128i *out = (__m128i*)(_out); + int i; + (void) _in; + for (i = 0; i < 8; ++i) { + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + } + + return initOffset; +} + + """) + print(""" + +static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { + (void) initOffset; + (void) _in; + (void) out; +} +""") + for bit in range(1,33): + offsetVar = " initOffset"; + print(""" +static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + """); + + if (bit != 32): + print(" __m128i CurrIn = _mm_load_si128(in);"); + print(" __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);"); + else: + print(" __m128i InReg = _mm_load_si128(in);"); + print(" (void) initOffset;"); + + + inwordpointer = 0 + valuecounter = 0 + for k in range(ceil((length * bit) / 32)): + if(valuecounter == length): break + for x in range(inwordpointer,32,bit): + if(x!=0) 
: + print(" OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));"); + else: + print(" OutReg = InReg; "); + if((x+bit>=32) ): + while(inwordpointer<32): + inwordpointer += bit + print(" _mm_store_si128(out, OutReg);"); + print(""); + + if(valuecounter + 1 < length): + print(" ++out;") + inwordpointer -= 32; + if(inwordpointer>0): + print(" OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");"); + if(valuecounter + 1 < length): + print(" ++in;") + + if (bit != 32): + print(" CurrIn = _mm_load_si128(in);"); + print(" InReg = _mm_sub_epi32(CurrIn, initOffset);"); + else: + print(" InReg = _mm_load_si128(in);"); + print(""); + valuecounter = valuecounter + 1 + if(valuecounter == length): break + assert(valuecounter == length) + print("\n}\n\n""") + + for bit in range(1,32): + offsetVar = " initOffset"; + print("""\n +static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const __m128i* in, uint32_t * _out) { + """); + print(""" __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<"""+str(bit)+""")-1); + + """); + + MainText = ""; + + MainText += "\n"; + inwordpointer = 0 + valuecounter = 0 + for k in range(ceil((length * bit) / 32)): + for x in range(inwordpointer,32,bit): + if(valuecounter == length): break + if (x > 0): + MainText += " tmp = _mm_srli_epi32(InReg," + str(x) +");\n"; + else: + MainText += " tmp = InReg;\n"; + if(x+bit<32): + MainText += " OutReg = _mm_and_si128(tmp, mask);\n"; + else: + MainText += " OutReg = tmp;\n"; + if((x+bit>=32) ): + while(inwordpointer<32): + inwordpointer += bit + if(valuecounter + 1 < length): + MainText += " ++in;" + MainText += " InReg = _mm_load_si128(in);\n"; + inwordpointer -= 32; + if(inwordpointer>0): + MainText += " OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n"; + if (bit != 32): + MainText += 
" OutReg = _mm_add_epi32(OutReg, initOffset);\n"; + MainText += " _mm_store_si128(out++, OutReg);\n\n"; + MainText += ""; + valuecounter = valuecounter + 1 + if(valuecounter == length): break + assert(valuecounter == length) + print(MainText) + print(" return initOffset;"); + print("\n}\n\n") + print(""" +static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) { + __m128i * mout = (__m128i *)_out; + __m128i invec; + size_t k; + for(k = 0; k < 128/4; ++k) { + invec = _mm_load_si128(in++); + _mm_store_si128(mout++, invec); + } + return invec; +} + """) diff --git a/simdcomp.def b/simdcomp.def new file mode 100644 index 000000000..343e148c9 --- /dev/null +++ b/simdcomp.def @@ -0,0 +1,40 @@ +EXPORTS + simdpack + simdpackwithoutmask + simdunpack + bits + maxbits + maxbits_length + simdmin + simdmin_length + simdmaxmin + simdmaxmin_length + simdmaxbitsd1 + simdmaxbitsd1_length + simdpackd1 + simdpackwithoutmaskd1 + simdunpackd1 + simdsearchd1 + simdsearchwithlengthd1 + simdselectd1 + simdpackFOR + simdselectFOR + simdsearchwithlengthFOR + simdunpackFOR + simdmin_length + simdmaxmin + simdmaxmin_length + simdpack_length + simdpackFOR_length + simdunpackFOR_length + simdpack_shortlength + simdfastsetFOR + simdfastset + simdfastsetd1 + simdunpack_length + simdunpack_shortlength + simdsearchwithlengthFOR + simdscand1 + simdfastsetd1fromprevious + simdfastsetd1 + diff --git a/src/avxbitpacking.c b/src/avxbitpacking.c new file mode 100644 index 000000000..f592e3687 --- /dev/null +++ b/src/avxbitpacking.c @@ -0,0 +1,7795 @@ +#include "avxbitpacking.h" +#ifdef __AVX2__ + + +static uint32_t maxbitas32int(const __m256i accumulator) { + const __m256i _tmp1 = _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + uint32_t ans1 = _mm256_extract_epi32(_tmp2,0); + uint32_t 
ans2 = _mm256_extract_epi32(_tmp2,4); + uint32_t ans = ans1 > ans2 ? ans1 : ans2; + return bits(ans); +} + +uint32_t avxmaxbits(const uint32_t * begin) { + const __m256i* pin = (const __m256i*)(begin); + __m256i accumulator = _mm256_lddqu_si256(pin); + uint32_t k = 1; + for(; 8*k < AVXBlockSize; ++k) { + __m256i newvec = _mm256_lddqu_si256(pin+k); + accumulator = _mm256_or_si256(accumulator,newvec); + } + return maxbitas32int(accumulator); +} + + +/** code generated by avxpacking.py starts here **/ + +typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed); +typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout); + +static void avxpackblock0(const uint32_t * pin, __m256i * compressed) { + (void)compressed; + (void) pin; /* we consumed 256 32-bit integers */ +} + + +/* we are going to pack 256 1-bit values, touching 1 256-bit words, using 16 bytes */ +static void avxpackblock1(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 1 256-bit word */ + __m256i w0; + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 1)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 3)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 9)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 11)); + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 13)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 15)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 17)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 19)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 21)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 22)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 23)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 25)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 26)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 27)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 28)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 29)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 30)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 31)); + _mm256_storeu_si256(compressed + 0, w0); +} + + +/* we are going to pack 256 2-bit values, touching 2 256-bit words, using 32 bytes */ +static void avxpackblock2(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 2 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256 (in + 0); + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 22)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 26)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 28)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 30)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 6)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 14)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 18)); 
+ w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 22)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 24)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 26)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 28)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 30)); + _mm256_storeu_si256(compressed + 1, w1); +} + + +/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 48 bytes */ +static void avxpackblock3(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 3 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 3)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 9)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 15)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 21)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 27)); + tmp = _mm256_lddqu_si256 (in + 10); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4)); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 7)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 13)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 19)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 22)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 25)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 28)); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 11)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 17)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 23)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 26)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 29)); + _mm256_storeu_si256(compressed + 2, w0); +} + + +/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 bytes */ +static void avxpackblock4(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 4 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256 (in + 0); + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 28)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256 (in + 8); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 24)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 28)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 28)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 4)); 
+ w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 24)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 28)); + _mm256_storeu_si256(compressed + 3, w1); +} + + +/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 bytes */ +static void avxpackblock5(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 5 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 15)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 25)); + tmp = _mm256_lddqu_si256 (in + 6); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 13)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 18)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 23)); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + 
_mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 1)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 11)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 21)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 26)); + tmp = _mm256_lddqu_si256 (in + 19); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 9)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 14)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 19)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 24)); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 17)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 22)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 27)); + _mm256_storeu_si256(compressed + 4, w0); +} + + +/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 bytes */ +static void avxpackblock6(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i 
*) pin; + /* we are going to touch 6 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 24)); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 22)); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 26)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 6)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 18)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 24)); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 22)); + tmp = _mm256_lddqu_si256 (in + 26); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 14)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 26)); + _mm256_storeu_si256(compressed + 5, w1); +} + + +/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 bytes */ +static void avxpackblock7(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 7 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 21)); + tmp = _mm256_lddqu_si256 (in + 4); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 
10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 17)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 24)); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 13)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 20)); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 9)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 23)); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 19)); + tmp = _mm256_lddqu_si256 (in + 22); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 15)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 22)); + tmp = _mm256_lddqu_si256 
(in + 27); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 11)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 25)); + _mm256_storeu_si256(compressed + 6, w0); +} + + +/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 bytes */ +static void avxpackblock8(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 8 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 24)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 24)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 24)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 16)); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 24)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 24)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256 (in + 20); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 24)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 24)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 24)); + _mm256_storeu_si256(compressed + 7, w1); +} + + +/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 bytes */ +static void avxpackblock9(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 9 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 9)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 18)); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 13)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 22)); + tmp = _mm256_lddqu_si256 (in + 7); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 17)); + tmp = _mm256_lddqu_si256 (in + 10); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 21)); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + tmp = _mm256_lddqu_si256 (in + 17); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 11)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 20)); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + 
_mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 15)); + tmp = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 19)); + tmp = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 23)); + _mm256_storeu_si256(compressed + 8, w0); +} + + +/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 bytes */ +static void avxpackblock10(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 10 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 20)); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 18)); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16)); + tmp = _mm256_lddqu_si256 (in + 9); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 14)); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 22)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 20)); + tmp = _mm256_lddqu_si256 (in + 19); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 18)); + tmp = _mm256_lddqu_si256 (in + 22); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 6)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16)); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 14)); + tmp = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 22)); + _mm256_storeu_si256(compressed + 9, w1); +} + + +/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 bytes */ +static void avxpackblock11(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 11 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 11)); + tmp = _mm256_lddqu_si256 (in + 2); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12)); + tmp = _mm256_lddqu_si256 (in + 5); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 13)); + tmp = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + 
_mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 14)); + tmp = _mm256_lddqu_si256 (in + 11); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 15)); + tmp = _mm256_lddqu_si256 (in + 14); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 5)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 17)); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 7)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 18)); + tmp = _mm256_lddqu_si256 (in + 23); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 19)); + tmp = _mm256_lddqu_si256 (in + 26); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + 
_mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 9)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 20)); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 21)); + _mm256_storeu_si256(compressed + 10, w0); +} + + +/* we are going to pack 256 12-bit values, touching 12 256-bit words, using 192 bytes */ +static void avxpackblock12(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 12 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 12)); + tmp = _mm256_lddqu_si256 (in + 2); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 16)); + tmp = _mm256_lddqu_si256 (in + 5); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 20)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 8); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 12)); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + 
_mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 16)); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 20)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 12)); + tmp = _mm256_lddqu_si256 (in + 18); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 16)); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 20)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 12)); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 16)); + tmp = _mm256_lddqu_si256 (in + 29); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + 
_mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 20)); + _mm256_storeu_si256(compressed + 11, w1); +} + + +/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 bytes */ +static void avxpackblock13(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 13 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 13)); + tmp = _mm256_lddqu_si256 (in + 2); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 7)); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 1)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 14)); + tmp = _mm256_lddqu_si256 (in + 7); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8)); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21)); + w0 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 15)); + tmp = _mm256_lddqu_si256 (in + 12); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + 
_mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 9)); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 3)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + tmp = _mm256_lddqu_si256 (in + 17); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 10)); + tmp = _mm256_lddqu_si256 (in + 19); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 17)); + tmp = _mm256_lddqu_si256 (in + 22); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 11)); + tmp = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 18)); + tmp = _mm256_lddqu_si256 (in + 27); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 12)); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + 
_mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 19)); + _mm256_storeu_si256(compressed + 12, w0); +} + + +/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 bytes */ +static void avxpackblock14(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 14 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 14)); + tmp = _mm256_lddqu_si256 (in + 2); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 10)); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 6)); + tmp = _mm256_lddqu_si256 (in + 6); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16)); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 12)); + tmp = _mm256_lddqu_si256 (in + 11); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8)); + tmp = _mm256_lddqu_si256 (in + 13); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 18)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 14)); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 10)); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 6)); + tmp = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16)); + tmp = _mm256_lddqu_si256 (in + 25); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 12)); + tmp = _mm256_lddqu_si256 (in + 27); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8)); + tmp = _mm256_lddqu_si256 (in + 29); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + 
w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 18)); + _mm256_storeu_si256(compressed + 13, w1); +} + + +/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 bytes */ +static void avxpackblock15(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 15 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 15)); + tmp = _mm256_lddqu_si256 (in + 2); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 13)); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 11)); + tmp = _mm256_lddqu_si256 (in + 6); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 9)); + tmp = _mm256_lddqu_si256 (in + 8); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 7)); + tmp = _mm256_lddqu_si256 (in + 10); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 5)); + tmp = 
_mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 3)); + tmp = _mm256_lddqu_si256 (in + 14); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16)); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 14)); + tmp = _mm256_lddqu_si256 (in + 19); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 12)); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 10)); + tmp = _mm256_lddqu_si256 (in + 23); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8)); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 6)); + tmp = _mm256_lddqu_si256 (in + 27); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 12, 
w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4)); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 17)); + _mm256_storeu_si256(compressed + 14, w0); +} + + +/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 bytes */ +static void avxpackblock16(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 16 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256 (in + 0); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 16)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 16)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256 (in + 4); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 16)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 16)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 16)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 16)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256 (in + 12); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 16)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 16)); + 
_mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 16)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 16)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 16)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 16)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 16)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 16)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 16)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 16)); + _mm256_storeu_si256(compressed + 15, w1); +} + + +/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 bytes */ +static void avxpackblock17(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 17 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 2)); + tmp = _mm256_lddqu_si256 (in + 3); + 
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4)); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 6)); + tmp = _mm256_lddqu_si256 (in + 7); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8)); + tmp = _mm256_lddqu_si256 (in + 9); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 10)); + tmp = _mm256_lddqu_si256 (in + 11); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12)); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 14)); + tmp = _mm256_lddqu_si256 (in + 15); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 1)); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = 
_mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 3)); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 5)); + tmp = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 7)); + tmp = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 9)); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 11)); + tmp = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 13)); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 15)); + _mm256_storeu_si256(compressed + 16, w0); +} + + +/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 bytes */ +static void avxpackblock18(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 18 256-bit words */ + __m256i w0, w1; + __m256i 
tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 4)); + tmp = _mm256_lddqu_si256 (in + 3); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8)); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 12)); + tmp = _mm256_lddqu_si256 (in + 7); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 2)); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 6)); + tmp = _mm256_lddqu_si256 (in + 12); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 10)); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 
14)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256 (in + 16); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 4)); + tmp = _mm256_lddqu_si256 (in + 19); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8)); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 12)); + tmp = _mm256_lddqu_si256 (in + 23); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 2)); + tmp = _mm256_lddqu_si256 (in + 26); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 6)); + tmp = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 10)); + tmp = _mm256_lddqu_si256 (in + 30); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 14)); + _mm256_storeu_si256(compressed + 17, w1); +} + + +/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 bytes */ +static void avxpackblock19(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 19 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 6)); + tmp = _mm256_lddqu_si256 (in + 3); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12)); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 5)); + tmp = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 11)); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 11); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 6, 
w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4)); + tmp = _mm256_lddqu_si256 (in + 13); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 10)); + tmp = _mm256_lddqu_si256 (in + 15); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 3)); + tmp = _mm256_lddqu_si256 (in + 18); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 9)); + tmp = _mm256_lddqu_si256 (in + 20); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 21); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 2)); + tmp = _mm256_lddqu_si256 (in + 23); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21)); + w0 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8)); + tmp = _mm256_lddqu_si256 (in + 25); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + 
_mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 1)); + tmp = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 7)); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 13)); + _mm256_storeu_si256(compressed + 18, w0); +} + + +/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 bytes */ +static void avxpackblock20(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 20 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 8)); + tmp = _mm256_lddqu_si256 (in + 3); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 4); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 4)); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 12)); + _mm256_storeu_si256(compressed + 
4, w0); + w1 = _mm256_lddqu_si256 (in + 8); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 8)); + tmp = _mm256_lddqu_si256 (in + 11); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 4)); + tmp = _mm256_lddqu_si256 (in + 14); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 12)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256 (in + 16); + tmp = _mm256_lddqu_si256 (in + 17); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 8)); + tmp = _mm256_lddqu_si256 (in + 19); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 4)); + tmp = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 12)); + 
_mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256 (in + 24); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 8)); + tmp = _mm256_lddqu_si256 (in + 27); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 4)); + tmp = _mm256_lddqu_si256 (in + 30); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 12)); + _mm256_storeu_si256(compressed + 19, w1); +} + + +/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 bytes */ +static void avxpackblock21(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 21 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 10)); + tmp = _mm256_lddqu_si256 (in + 3); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 4); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + 
_mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 9)); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 7); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8)); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 10); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 7)); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 6)); + tmp = _mm256_lddqu_si256 (in + 15); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 5)); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 
(in + 19); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4)); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256 (in + 22); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 3)); + tmp = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 25); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13)); + w1 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 2)); + tmp = _mm256_lddqu_si256 (in + 27); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 1)); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 11)); + _mm256_storeu_si256(compressed + 20, w0); +} + + +/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 bytes */ +static void avxpackblock22(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i 
*) pin; + /* we are going to touch 22 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 2)); + tmp = _mm256_lddqu_si256 (in + 4); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 5); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 4)); + tmp = _mm256_lddqu_si256 (in + 7); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256 (in + 8); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 6)); + tmp = _mm256_lddqu_si256 (in + 10); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 11); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8)); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + 
_mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 10)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256 (in + 16); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 18); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 2)); + tmp = _mm256_lddqu_si256 (in + 20); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256 (in + 21); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 4)); + tmp = _mm256_lddqu_si256 (in + 23); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 6)); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 27); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 18, w0); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8)); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256 (in + 30); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 10)); + _mm256_storeu_si256(compressed + 21, w1); +} + + +/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 bytes */ +static void avxpackblock23(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 23 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23)); + w1 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 5)); + tmp = _mm256_lddqu_si256 (in + 4); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 5); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 6); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 1)); + tmp = _mm256_lddqu_si256 (in + 8); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 9); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 6)); + tmp = _mm256_lddqu_si256 (in + 11); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 12); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 13); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 11)); + w0 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 2)); + tmp = _mm256_lddqu_si256 (in + 15); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 7)); + tmp = _mm256_lddqu_si256 (in + 18); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 19); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21)); + w0 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 3)); + tmp = 
_mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 23); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8)); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 26); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256 (in + 27); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13)); + w0 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4)); + tmp = _mm256_lddqu_si256 (in + 29); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 9)); + _mm256_storeu_si256(compressed + 22, w0); +} + + +/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 bytes */ +static void avxpackblock24(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 24 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + 
_mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 8)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 4); + tmp = _mm256_lddqu_si256 (in + 5); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 6); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 8)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256 (in + 8); + tmp = _mm256_lddqu_si256 (in + 9); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 8)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256 (in + 12); + tmp = _mm256_lddqu_si256 (in + 13); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256 (in + 14); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 8)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256 (in + 16); + tmp = _mm256_lddqu_si256 (in + 17); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + 
_mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 8)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256 (in + 20); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 22); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 8)); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_lddqu_si256 (in + 24); + tmp = _mm256_lddqu_si256 (in + 25); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 8)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256 (in + 28); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 30); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 8)); + _mm256_storeu_si256(compressed + 23, w1); +} + + +/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 bytes */ +static void avxpackblock25(const uint32_t * pin, 
__m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 25 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4)); + tmp = _mm256_lddqu_si256 (in + 5); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 6); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256 (in + 7); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 15)); + w0 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 1)); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 11); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 5)); + tmp = _mm256_lddqu_si256 (in + 14); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 15); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9)); + w0 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 2)); + tmp = _mm256_lddqu_si256 (in + 19); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256 (in + 20); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 21); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13)); + w1 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 6)); + tmp = _mm256_lddqu_si256 (in + 23); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 
17)); + w0 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256 (in + 26); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 3)); + tmp = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 29); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 7)); + _mm256_storeu_si256(compressed + 24, w0); +} + + +/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 bytes */ +static void avxpackblock26(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 26 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + 
_mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 2)); + tmp = _mm256_lddqu_si256 (in + 6); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256 (in + 7); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 4)); + tmp = _mm256_lddqu_si256 (in + 11); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 6)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256 (in + 16); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256 (in + 18); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = 
_mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256 (in + 19); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 2)); + tmp = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 23); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256 (in + 25); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 4)); + tmp = _mm256_lddqu_si256 (in + 27); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256 (in + 30); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 24, w0); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 6)); + _mm256_storeu_si256(compressed + 25, w1); +} + + +/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 bytes */ +static void avxpackblock27(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 27 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7)); + w1 = _mm256_srli_epi32(tmp,25); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 2)); + tmp = _mm256_lddqu_si256 (in + 7); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 8); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 10); + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 11); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9)); + w0 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4)); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 15); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256 (in + 17); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 1)); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 22); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 18, w0); 
+ tmp = _mm256_lddqu_si256 (in + 23); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13)); + w0 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 3)); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 27); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256 (in + 29); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 5)); + _mm256_storeu_si256(compressed + 26, w0); +} + + +/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 bytes */ +static void avxpackblock28(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 28 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 4)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256 (in + 8); + tmp = _mm256_lddqu_si256 (in + 9); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 10); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 11); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256 (in + 12); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 13); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 14); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 12, w0); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 4)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256 (in + 16); + tmp = _mm256_lddqu_si256 (in + 17); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 19); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256 (in + 20); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 21); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 4)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256 (in + 24); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 26); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256 (in + 27); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + 
_mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256 (in + 30); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 4)); + _mm256_storeu_si256(compressed + 27, w1); +} + + +/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 bytes */ +static void avxpackblock29(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 29 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23)); + w1 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 7); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + 
w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 8); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 9); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5)); + w1 = _mm256_srli_epi32(tmp,27); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 2)); + tmp = _mm256_lddqu_si256 (in + 11); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256 (in + 12); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 13); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 14); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 15); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256 (in + 16); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13)); + w0 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 18); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256 (in + 19); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 7)); + w0 = _mm256_srli_epi32(tmp,25); + 
_mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4)); + w1 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 1)); + tmp = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256 (in + 23); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 25); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256 (in + 27); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256 (in + 29); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9)); + w1 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 27, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 3)); + _mm256_storeu_si256(compressed + 28, w0); +} + + +/* we are going to pack 256 
30-bit values, touching 30 256-bit words, using 480 bytes */ +static void avxpackblock30(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 30 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 7); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 8); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 9); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + 
_mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256 (in + 11); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 6)); + w1 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4)); + w0 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 2)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256 (in + 16); + tmp = _mm256_lddqu_si256 (in + 17); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 18); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256 (in + 19); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 20); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256 (in + 21); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256 (in + 22); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256 (in + 23); + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 24); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256 (in + 25); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256 (in + 26); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256 (in + 27); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256 (in + 28); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_lddqu_si256 (in + 29); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_lddqu_si256 (in + 30); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4)); + w1 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 2)); + _mm256_storeu_si256(compressed + 29, w1); +} + + +/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 bytes */ +static void avxpackblock31(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 31 256-bit words */ + __m256i w0, w1; + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_lddqu_si256 (in + 0); + tmp = _mm256_lddqu_si256 (in + 1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 0, w0); + 
tmp = _mm256_lddqu_si256 (in + 2); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_lddqu_si256 (in + 3); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_lddqu_si256 (in + 4); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_lddqu_si256 (in + 5); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_lddqu_si256 (in + 6); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_lddqu_si256 (in + 7); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_lddqu_si256 (in + 8); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_lddqu_si256 (in + 9); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23)); + w1 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_lddqu_si256 (in + 10); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_lddqu_si256 (in + 11); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_lddqu_si256 (in + 12); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_lddqu_si256 (in + 13); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + 
_mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_lddqu_si256 (in + 14); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_lddqu_si256 (in + 15); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_lddqu_si256 (in + 16); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_lddqu_si256 (in + 17); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_lddqu_si256 (in + 18); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_lddqu_si256 (in + 19); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13)); + w1 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_lddqu_si256 (in + 20); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_lddqu_si256 (in + 21); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_lddqu_si256 (in + 22); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_lddqu_si256 (in + 23); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9)); + w1 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_lddqu_si256 (in + 24); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_lddqu_si256 (in + 25); + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7)); + w1 = _mm256_srli_epi32(tmp,25); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_lddqu_si256 (in + 26); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_lddqu_si256 (in + 27); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5)); + w1 = _mm256_srli_epi32(tmp,27); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_lddqu_si256 (in + 28); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4)); + w0 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_lddqu_si256 (in + 29); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 3)); + w1 = _mm256_srli_epi32(tmp,29); + _mm256_storeu_si256(compressed + 28, w0); + tmp = _mm256_lddqu_si256 (in + 30); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 2)); + w0 = _mm256_srli_epi32(tmp,30); + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 1)); + _mm256_storeu_si256(compressed + 30, w0); +} + + +/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 bytes */ +static void avxpackblock32(const uint32_t * pin, __m256i * compressed) { + const __m256i * in = (const __m256i *) pin; + /* we are going to touch 32 256-bit words */ + __m256i w0, w1; + w0 = _mm256_lddqu_si256 (in + 0); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256 (in + 1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_lddqu_si256 (in + 2); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 3); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_lddqu_si256 (in + 4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256 (in + 5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256 (in + 6); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256 (in + 7); + 
_mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_lddqu_si256 (in + 8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256 (in + 9); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256 (in + 10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256 (in + 11); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256 (in + 12); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256 (in + 13); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256 (in + 14); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256 (in + 15); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_lddqu_si256 (in + 16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_lddqu_si256 (in + 17); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_lddqu_si256 (in + 18); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_lddqu_si256 (in + 19); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_lddqu_si256 (in + 20); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256 (in + 21); + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_lddqu_si256 (in + 22); + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_lddqu_si256 (in + 23); + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_lddqu_si256 (in + 24); + _mm256_storeu_si256(compressed + 24, w0); + w1 = _mm256_lddqu_si256 (in + 25); + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_lddqu_si256 (in + 26); + _mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_lddqu_si256 (in + 27); + _mm256_storeu_si256(compressed + 27, w1); + w0 = _mm256_lddqu_si256 (in + 28); + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_lddqu_si256 (in + 29); + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_lddqu_si256 (in + 30); + _mm256_storeu_si256(compressed + 30, w0); + w1 = _mm256_lddqu_si256 (in + 31); + _mm256_storeu_si256(compressed + 31, w1); +} + + +static 
void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) { + (void)compressed; + (void) pin; /* we consumed 256 32-bit integers */ +} + + +/* we are going to pack 256 1-bit values, touching 1 256-bit words, using 16 bytes */ +static void avxpackblockmask1(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 1 256-bit word */ + __m256i w0; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 1)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 3)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 9)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 11)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 13)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 14) ) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 15)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 17)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 19)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 21)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 22)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 23)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 25)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 26)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 27)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 28)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 29)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 30)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 31)); + _mm256_storeu_si256(compressed + 0, w0); +} + + +/* we are going to pack 256 2-bit values, touching 2 
256-bit words, using 32 bytes */ +static void avxpackblockmask2(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 2 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(3); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 22)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 26)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 28)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 30)); + _mm256_storeu_si256(compressed + 0, w0); + w1 
= _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 6)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 14)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 18)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 22)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 24)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 26)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 28)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 30)); + _mm256_storeu_si256(compressed + 1, w1); +} + + +/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 48 bytes */ +static void avxpackblockmask3(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 3 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const 
__m256i *) pin; + const __m256i mask = _mm256_set1_epi32(7); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 3)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 9)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 15)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 21)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 27)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 7)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 13)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 
16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 19)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 22)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 25)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 28)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 11)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 17)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 23)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 26)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 29)); + _mm256_storeu_si256(compressed + 2, w0); +} + + +/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 bytes */ +static void avxpackblockmask4(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 4 256-bit words */ + /* NOTE: 4 divides 32, so eight values fill each 32-bit lane exactly and no carry temporary is needed */ + __m256i
w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(15); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 28)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 24)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 28)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ,
4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 24)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 28)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 24)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 28)); + _mm256_storeu_si256(compressed + 3, w1); +} + + +/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 bytes */ +static void avxpackblockmask5(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 5 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(31); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask,
_mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 15)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 25)); + /* in+6 straddles the 32-bit lane boundary: shift 30 keeps its low 2 bits in w0, srli 2 carries the high 3 bits into w1 */ + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 13)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 18)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 23)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 1)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 11)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 21)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 26)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 9)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 14)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 19)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 24)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 17)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 22)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 27)); + _mm256_storeu_si256(compressed + 4, w0); +} + + +/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 bytes */ +static void avxpackblockmask6(const uint32_t * pin, __m256i * compressed) { + /* we are going to
touch 6 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(63); + __m256i tmp; /* used to store inputs at word boundary */ + /* NOTE: tmp holds a value split across two output words: the slli keeps its low bits in the word being finished, the srli seeds its high bits into the next */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 24)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 22)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 20)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 26)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 6)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 18)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 24)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 22)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 14)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 20)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 26)); +
_mm256_storeu_si256(compressed + 5, w1); +} + + +/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 bytes */ +static void avxpackblockmask7(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 7 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(127); + __m256i tmp; /* used to store inputs at word boundary */ + /* NOTE: every tmp load below sits at a shift greater than 25 (= 32-7), so its 7 bits are split between the current output word and the next */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 21)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 17)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 24)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 13)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 20)); + tmp = _mm256_and_si256 ( mask,
_mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 9)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 23)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 19)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 15)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 22)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 11)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 18)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 25)); + _mm256_storeu_si256(compressed + 6, w0); +} + + +/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 bytes */ +static void avxpackblockmask8(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 8 256-bit words */ + /* NOTE: 8 divides 32, so four values fill each 32-bit lane exactly and no carry temporary is declared */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(255); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 24)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 24)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in +
11) ) , 24)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 24)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 24)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 24)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 16)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 24)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 8)); + w1 =
_mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 16)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 24)); + _mm256_storeu_si256(compressed + 7, w1); +} + + +/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 bytes */ +static void avxpackblockmask9(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 9 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(511); + __m256i tmp; /* used to store inputs at word boundary */ + /* NOTE: inputs loaded into tmp sit at shifts above 23 (= 32-9) and are split between the current output word and the next */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 9)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 18)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 13)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 22)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 17)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 =
_mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 21)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 7)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 11)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 20)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 15)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 19)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 14)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 23)); + _mm256_storeu_si256(compressed + 8, w0); +} + + +/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 bytes */ +static void avxpackblockmask10(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 10 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(1023); + __m256i tmp; /* used to store inputs at word boundary */ + /* NOTE: tmp carries the bits of a value that straddles a 32-bit lane boundary into the next output word */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 20)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 18)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; +
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 12)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 22)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 10)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 20)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask,
_mm256_lddqu_si256 (in + 21) ) , 18)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 6)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 12)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 22)); + _mm256_storeu_si256(compressed + 9, w1); +} + + +/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 bytes */ +static void avxpackblockmask11(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 11 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(2047); + __m256i tmp; /* used to store inputs at word boundary */ + /* NOTE: each tmp load is a value split across two 32-bit words: low bits complete the current word, high bits seed the next */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ,
11)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 13)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 3)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 15)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 5)); + w1 =
_mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 17)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 7)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 18)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 19)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 9)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 20)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 9, w1); + w0 =
_mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 10)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 21)); + _mm256_storeu_si256(compressed + 10, w0); +} + + +/* we are going to pack 256 12-bit values, touching 12 256-bit words, using 192 bytes */ +static void avxpackblockmask12(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 12 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(4095); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 20)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 =
_mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 20)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 8)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 20)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 12)); + tmp = _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 8)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 20)); + _mm256_storeu_si256(compressed + 11, w1); +} + + +/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 bytes */ +static void avxpackblockmask13(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 13 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(8191); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 13)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 7)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 
(in + 5) ) , 1)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21)); + w0 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 15)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 9)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 3)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + 
w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 17)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 11)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 5)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 18)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 6)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 19)); + _mm256_storeu_si256(compressed + 12, w0); +} + + +/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 bytes */ +static void avxpackblockmask14(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 14 256-bit words 
*/ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(16383); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 2)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8)); + tmp = 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 4)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 18)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 12)); + tmp = _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 27) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 4)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 18)); + _mm256_storeu_si256(compressed + 13, w1); +} + + +/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 bytes */ +static void avxpackblockmask15(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 15 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(32767); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 15)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 13)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 11)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 9)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 7)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 5)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 3)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 1)); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 
= _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 2)); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 17)); + _mm256_storeu_si256(compressed + 14, w0); +} + + +/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 bytes */ +static void 
avxpackblockmask16(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 16 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(65535); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 16)); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 16)); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 16)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 16)); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 16)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 16)); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 16)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 16)); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 16)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 16)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 16)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 16)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 16)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 16)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 16)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 16)); + _mm256_storeu_si256(compressed + 15, w1); +} + + +/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 bytes */ +static void avxpackblockmask17(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 17 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + 
const __m256i mask = _mm256_set1_epi32(131071); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 
12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 14)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 1)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 3)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 5)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 7)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 9)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 11)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 13)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 15)); + _mm256_storeu_si256(compressed + 16, w0); +} + + +/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 bytes */ +static void avxpackblockmask18(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 18 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(262143); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 1, w1); + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 14)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) 
; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 10)); + tmp = 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 14)); + _mm256_storeu_si256(compressed + 17, w1); +} + + +/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 bytes */ +static void avxpackblockmask19(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 19 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(524287); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 5)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 11)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 3)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 9)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = 
_mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21)); + w0 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 1)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 7)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 13)); + _mm256_storeu_si256(compressed + 18, w0); +} + + +/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 bytes */ +static void avxpackblockmask20(const uint32_t * 
pin, __m256i * compressed) { + /* we are going to touch 20 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(1048575); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 12)); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + 
_mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 12)); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 12)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + 
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 12)); + _mm256_storeu_si256(compressed + 19, w1); +} + + +/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 bytes */ +static void avxpackblockmask21(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 21 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(2097151); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 10)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + 
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 9)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 7)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 
(in + 15) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27)); + w0 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 5)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 3)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13)); + w1 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 2)); + tmp = _mm256_and_si256 
( mask, _mm256_lddqu_si256 (in + 27) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 1)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 11)); + _mm256_storeu_si256(compressed + 20, w0); +} + + +/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 bytes */ +static void avxpackblockmask22(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 22 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(4194303); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 2, w0); + 
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 10)); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8)); + tmp = _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 10)); + _mm256_storeu_si256(compressed + 21, w1); +} + + +/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 bytes */ +static void avxpackblockmask23(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 23 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(8388607); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23)); + w1 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 5)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + 
w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 1)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 11)); + w0 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 7)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w0 = 
_mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21)); + w0 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 3)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13)); + w0 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = 
_mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 9)); + _mm256_storeu_si256(compressed + 22, w0); +} + + +/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 bytes */ +static void avxpackblockmask24(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 24 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(16777215); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 1, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 8)); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 8)); + 
_mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 8)); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 8)); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 8)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 
(in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 8)); + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 8)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 8)); + _mm256_storeu_si256(compressed + 23, w1); +} + + +/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 bytes */ +static void avxpackblockmask25(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 25 256-bit words */ + __m256i w0, w1; + const __m256i * in = 
(const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(33554431); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 15)); + w0 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 1)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 7, w1); + tmp 
= _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 5)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9)); + w0 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13)); + w1 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 16, 
w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 6)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 17)); + w0 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 3)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 7)); + _mm256_storeu_si256(compressed + 24, w0); +} + + +/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 bytes */ +static void avxpackblockmask26(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 26 256-bit words */ + 
__m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(67108863); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 10) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 6)); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 2)); + tmp = _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 24, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 6)); + _mm256_storeu_si256(compressed + 25, w1); +} + + +/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 bytes */ +static void avxpackblockmask27(const uint32_t * pin, __m256i * compressed) { 
+ /* we are going to touch 27 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(134217727); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7)); + w1 = _mm256_srli_epi32(tmp,25); + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29)); + w0 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 
( mask, _mm256_lddqu_si256 (in + 10) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9)); + w0 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 1)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23)); + w0 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13)); + w0 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 3)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 
5)); + _mm256_storeu_si256(compressed + 26, w0); +} + + +/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 bytes */ +static void avxpackblockmask28(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 28 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(268435455); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 4)); + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w1 = 
_mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 4)); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) 
; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 4)); + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 4)); + _mm256_storeu_si256(compressed + 27, w1); +} + + +/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 bytes */ +static void avxpackblockmask29(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 29 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(536870911); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23)); + w1 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, 
_mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5)); + w1 = _mm256_srli_epi32(tmp,27); + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 2)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31)); + w0 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25)); + w0 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19)); + w0 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13)); + w0 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = 
_mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 7)); + w0 = _mm256_srli_epi32(tmp,25); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4)); + w1 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 1)); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256 
( mask, _mm256_lddqu_si256 (in + 29) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9)); + w1 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 27, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 3)); + _mm256_storeu_si256(compressed + 28, w0); +} + + +/* we are going to pack 256 30-bit values, touching 30 256-bit words, using 480 bytes */ +static void avxpackblockmask30(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 30 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(1073741823); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30)); + w1 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26)); + w1 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 2, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22)); + w1 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256 
( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18)); + w1 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14)); + w1 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10)); + w1 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 6)); + w1 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 12, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4)); + w0 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 2)); + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + tmp = 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28)); + w1 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24)); + w1 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20)); + w1 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16)); + w1 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 22, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12)); + w1 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 24, w0); + tmp = 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8)); + w1 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4)); + w1 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 2)); + _mm256_storeu_si256(compressed + 29, w1); +} + + +/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 bytes */ +static void avxpackblockmask31(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 31 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + const __m256i mask = _mm256_set1_epi32(2147483647); + __m256i tmp; /* used to store inputs at word boundary */ + w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ; + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31)); + w1 = _mm256_srli_epi32(tmp,1); + _mm256_storeu_si256(compressed + 0, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30)); + w0 = _mm256_srli_epi32(tmp,2); + _mm256_storeu_si256(compressed + 1, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29)); + w1 = _mm256_srli_epi32(tmp,3); + _mm256_storeu_si256(compressed + 2, w0); + 
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28)); + w0 = _mm256_srli_epi32(tmp,4); + _mm256_storeu_si256(compressed + 3, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27)); + w1 = _mm256_srli_epi32(tmp,5); + _mm256_storeu_si256(compressed + 4, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26)); + w0 = _mm256_srli_epi32(tmp,6); + _mm256_storeu_si256(compressed + 5, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25)); + w1 = _mm256_srli_epi32(tmp,7); + _mm256_storeu_si256(compressed + 6, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24)); + w0 = _mm256_srli_epi32(tmp,8); + _mm256_storeu_si256(compressed + 7, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23)); + w1 = _mm256_srli_epi32(tmp,9); + _mm256_storeu_si256(compressed + 8, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22)); + w0 = _mm256_srli_epi32(tmp,10); + _mm256_storeu_si256(compressed + 9, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21)); + w1 = _mm256_srli_epi32(tmp,11); + _mm256_storeu_si256(compressed + 10, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20)); + w0 = _mm256_srli_epi32(tmp,12); + _mm256_storeu_si256(compressed + 11, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19)); + w1 = _mm256_srli_epi32(tmp,13); + _mm256_storeu_si256(compressed + 12, w0); + tmp = 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18)); + w0 = _mm256_srli_epi32(tmp,14); + _mm256_storeu_si256(compressed + 13, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17)); + w1 = _mm256_srli_epi32(tmp,15); + _mm256_storeu_si256(compressed + 14, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16)); + w0 = _mm256_srli_epi32(tmp,16); + _mm256_storeu_si256(compressed + 15, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15)); + w1 = _mm256_srli_epi32(tmp,17); + _mm256_storeu_si256(compressed + 16, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14)); + w0 = _mm256_srli_epi32(tmp,18); + _mm256_storeu_si256(compressed + 17, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13)); + w1 = _mm256_srli_epi32(tmp,19); + _mm256_storeu_si256(compressed + 18, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12)); + w0 = _mm256_srli_epi32(tmp,20); + _mm256_storeu_si256(compressed + 19, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11)); + w1 = _mm256_srli_epi32(tmp,21); + _mm256_storeu_si256(compressed + 20, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10)); + w0 = _mm256_srli_epi32(tmp,22); + _mm256_storeu_si256(compressed + 21, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9)); + w1 = _mm256_srli_epi32(tmp,23); + _mm256_storeu_si256(compressed + 22, w0); + tmp = 
_mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8)); + w0 = _mm256_srli_epi32(tmp,24); + _mm256_storeu_si256(compressed + 23, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7)); + w1 = _mm256_srli_epi32(tmp,25); + _mm256_storeu_si256(compressed + 24, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6)); + w0 = _mm256_srli_epi32(tmp,26); + _mm256_storeu_si256(compressed + 25, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5)); + w1 = _mm256_srli_epi32(tmp,27); + _mm256_storeu_si256(compressed + 26, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4)); + w0 = _mm256_srli_epi32(tmp,28); + _mm256_storeu_si256(compressed + 27, w1); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ; + w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 3)); + w1 = _mm256_srli_epi32(tmp,29); + _mm256_storeu_si256(compressed + 28, w0); + tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ; + w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 2)); + w0 = _mm256_srli_epi32(tmp,30); + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 1)); + _mm256_storeu_si256(compressed + 30, w0); +} + + +/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 bytes */ +static void avxpackblockmask32(const uint32_t * pin, __m256i * compressed) { + /* we are going to touch 32 256-bit words */ + __m256i w0, w1; + const __m256i * in = (const __m256i *) pin; + w0 = _mm256_lddqu_si256 (in + 0) ; + _mm256_storeu_si256(compressed + 0, w0); + w1 = _mm256_lddqu_si256 (in + 1) ; + _mm256_storeu_si256(compressed + 1, w1); + w0 = 
_mm256_lddqu_si256 (in + 2) ; + _mm256_storeu_si256(compressed + 2, w0); + w1 = _mm256_lddqu_si256 (in + 3) ; + _mm256_storeu_si256(compressed + 3, w1); + w0 = _mm256_lddqu_si256 (in + 4) ; + _mm256_storeu_si256(compressed + 4, w0); + w1 = _mm256_lddqu_si256 (in + 5) ; + _mm256_storeu_si256(compressed + 5, w1); + w0 = _mm256_lddqu_si256 (in + 6) ; + _mm256_storeu_si256(compressed + 6, w0); + w1 = _mm256_lddqu_si256 (in + 7) ; + _mm256_storeu_si256(compressed + 7, w1); + w0 = _mm256_lddqu_si256 (in + 8) ; + _mm256_storeu_si256(compressed + 8, w0); + w1 = _mm256_lddqu_si256 (in + 9) ; + _mm256_storeu_si256(compressed + 9, w1); + w0 = _mm256_lddqu_si256 (in + 10) ; + _mm256_storeu_si256(compressed + 10, w0); + w1 = _mm256_lddqu_si256 (in + 11) ; + _mm256_storeu_si256(compressed + 11, w1); + w0 = _mm256_lddqu_si256 (in + 12) ; + _mm256_storeu_si256(compressed + 12, w0); + w1 = _mm256_lddqu_si256 (in + 13) ; + _mm256_storeu_si256(compressed + 13, w1); + w0 = _mm256_lddqu_si256 (in + 14) ; + _mm256_storeu_si256(compressed + 14, w0); + w1 = _mm256_lddqu_si256 (in + 15) ; + _mm256_storeu_si256(compressed + 15, w1); + w0 = _mm256_lddqu_si256 (in + 16) ; + _mm256_storeu_si256(compressed + 16, w0); + w1 = _mm256_lddqu_si256 (in + 17) ; + _mm256_storeu_si256(compressed + 17, w1); + w0 = _mm256_lddqu_si256 (in + 18) ; + _mm256_storeu_si256(compressed + 18, w0); + w1 = _mm256_lddqu_si256 (in + 19) ; + _mm256_storeu_si256(compressed + 19, w1); + w0 = _mm256_lddqu_si256 (in + 20) ; + _mm256_storeu_si256(compressed + 20, w0); + w1 = _mm256_lddqu_si256 (in + 21) ; + _mm256_storeu_si256(compressed + 21, w1); + w0 = _mm256_lddqu_si256 (in + 22) ; + _mm256_storeu_si256(compressed + 22, w0); + w1 = _mm256_lddqu_si256 (in + 23) ; + _mm256_storeu_si256(compressed + 23, w1); + w0 = _mm256_lddqu_si256 (in + 24) ; + _mm256_storeu_si256(compressed + 24, w0); + w1 = _mm256_lddqu_si256 (in + 25) ; + _mm256_storeu_si256(compressed + 25, w1); + w0 = _mm256_lddqu_si256 (in + 26) ; + 
_mm256_storeu_si256(compressed + 26, w0); + w1 = _mm256_lddqu_si256 (in + 27) ; + _mm256_storeu_si256(compressed + 27, w1); + w0 = _mm256_lddqu_si256 (in + 28) ; + _mm256_storeu_si256(compressed + 28, w0); + w1 = _mm256_lddqu_si256 (in + 29) ; + _mm256_storeu_si256(compressed + 29, w1); + w0 = _mm256_lddqu_si256 (in + 30) ; + _mm256_storeu_si256(compressed + 30, w0); + w1 = _mm256_lddqu_si256 (in + 31) ; + _mm256_storeu_si256(compressed + 31, w1); +} + +static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) { + (void) compressed; + memset(pout,0,256); +} + + +/* we packed 256 1-bit values, touching 1 256-bit words, using 16 bytes */ +static void avxunpackblock1(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 1 256-bit word */ + __m256i w0; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(1); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) ); + _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) ); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) ); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) ); + _mm256_storeu_si256(out + 12, 
_mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) ); + _mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) ); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 19) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) ); + _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 23) ) ); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 25) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 27) ) ); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 28) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 29) ) ); + _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 30) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 31) ); +} + + +/* we packed 256 2-bit values, touching 2 256-bit words, using 32 bytes */ +static void avxunpackblock2(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 2 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) 
pout; + const __m256i mask = _mm256_set1_epi32(3); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) ); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) ); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 28) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 30) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, 
_mm256_srli_epi32( w1 , 10) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) ); + _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) ); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) ); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 26) ) ); + _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 28) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 30) ); +} + + +/* we packed 256 3-bit values, touching 3 256-bit words, using 48 bytes */ +static void avxunpackblock3(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 3 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(7); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) ); + _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) ); + _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) ); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, 
_mm256_srli_epi32( w0 , 24) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 27) ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) ); + _mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) ); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 25) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 28) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) ); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) ); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( 
w0 , 20) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 23) ) ); + _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 29) ); +} + + +/* we packed 256 4-bit values, touching 4 256-bit words, using 64 bytes */ +static void avxunpackblock4(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 4 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(15); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) ); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 28) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 28) ); + w0 = _mm256_lddqu_si256 
(compressed + 2); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) ); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 28) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) ); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) ); + _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 28) ); +} + + +/* we packed 256 5-bit values, touching 5 256-bit words, using 80 bytes */ +static void avxunpackblock5(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 5 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(31); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) ); + 
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 25) ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) ); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) ); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 23) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) ); + _mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) ); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) ); + 
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) ); + _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) ); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) ); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) ); + _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 27) ); +} + + +/* we packed 256 6-bit values, touching 6 256-bit words, using 96 bytes */ +static void avxunpackblock6(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 6 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(63); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); 
+ _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) ); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 26) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) ); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, 
/* Decode 256 packed 7-bit values (per 32-bit lane) back into uint32s.
 * Reads 7 unaligned 256-bit words; inverse of the 7-bit pack routine.
 * Value i lives at bit offset i*7; when a value straddles two input
 * words its low part comes from a right shift of the current register
 * and its high part from a left shift of the freshly loaded one. */
static void avxunpackblock7(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 7) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 7)));
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 14)));
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 21)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  /* straddle: bits 28..31 from r0, bits 0..2 from r1 */
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 28), _mm256_slli_epi32(r1, 4))));
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 3)));
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 10)));
  _mm256_storeu_si256(dst + 7,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 17)));
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 24)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 31), _mm256_slli_epi32(r0, 1))));
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m, _mm256_srli_epi32(r0, 6)));
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m, _mm256_srli_epi32(r0, 13)));
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m, _mm256_srli_epi32(r0, 20)));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 27), _mm256_slli_epi32(r1, 5))));
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m, _mm256_srli_epi32(r1, 2)));
  _mm256_storeu_si256(dst + 15, _mm256_and_si256(m, _mm256_srli_epi32(r1, 9)));
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m, _mm256_srli_epi32(r1, 23)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 30), _mm256_slli_epi32(r0, 2))));
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m, _mm256_srli_epi32(r0, 5)));
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m, _mm256_srli_epi32(r0, 12)));
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m, _mm256_srli_epi32(r0, 19)));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 26), _mm256_slli_epi32(r1, 6))));
  _mm256_storeu_si256(dst + 23, _mm256_and_si256(m, _mm256_srli_epi32(r1, 1)));
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m, _mm256_srli_epi32(r1, 15)));
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m, _mm256_srli_epi32(r1, 22)));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 29), _mm256_slli_epi32(r0, 3))));
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m, _mm256_srli_epi32(r0, 4)));
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m, _mm256_srli_epi32(r0, 11)));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r0, 18)));
  /* top value already sits flush at bit 25: no mask needed */
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r0, 25));
}
/* Decode 256 packed 8-bit values (per 32-bit lane) back into uint32s.
 * Reads 8 unaligned 256-bit words; since 8 divides 32 exactly, each
 * input word yields four outputs at shifts 0/8/16/24 and no value ever
 * straddles a word boundary. */
static void avxunpackblock8(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 8) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  _mm256_storeu_si256(dst + 3,  _mm256_srli_epi32(r0, 24));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  _mm256_storeu_si256(dst + 7,  _mm256_srli_epi32(r1, 24));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  _mm256_storeu_si256(dst + 11, _mm256_srli_epi32(r0, 24));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  _mm256_storeu_si256(dst + 15, _mm256_srli_epi32(r1, 24));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  _mm256_storeu_si256(dst + 19, _mm256_srli_epi32(r0, 24));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  _mm256_storeu_si256(dst + 23, _mm256_srli_epi32(r1, 24));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  _mm256_storeu_si256(dst + 27, _mm256_srli_epi32(r0, 24));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r1, 24));
}
/* Decode 256 packed 9-bit values (per 32-bit lane) back into uint32s.
 * Reads 9 unaligned 256-bit words; value i starts at bit i*9, so every
 * fourth-or-so value straddles a word boundary and is reassembled from
 * a right shift of the old register OR'd with a left shift of the new. */
static void avxunpackblock9(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 9) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 9)));
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 18)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 27), _mm256_slli_epi32(r1, 5))));
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 4)));
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 13)));
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 22)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 7,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 31), _mm256_slli_epi32(r0, 1))));
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 17)));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 26), _mm256_slli_epi32(r1, 6))));
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m, _mm256_srli_epi32(r1, 3)));
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m, _mm256_srli_epi32(r1, 21)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 30), _mm256_slli_epi32(r0, 2))));
  _mm256_storeu_si256(dst + 15, _mm256_and_si256(m, _mm256_srli_epi32(r0, 7)));
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 25), _mm256_slli_epi32(r1, 7))));
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m, _mm256_srli_epi32(r1, 2)));
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m, _mm256_srli_epi32(r1, 11)));
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m, _mm256_srli_epi32(r1, 20)));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 29), _mm256_slli_epi32(r0, 3))));
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m, _mm256_srli_epi32(r0, 6)));
  _mm256_storeu_si256(dst + 23, _mm256_and_si256(m, _mm256_srli_epi32(r0, 15)));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 24), _mm256_slli_epi32(r1, 8))));
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m, _mm256_srli_epi32(r1, 1)));
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m, _mm256_srli_epi32(r1, 10)));
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m, _mm256_srli_epi32(r1, 19)));
  r0 = _mm256_lddqu_si256(compressed + 8);
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 28), _mm256_slli_epi32(r0, 4))));
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m, _mm256_srli_epi32(r0, 5)));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r0, 14)));
  /* last value is flush with the top of the word: no mask needed */
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r0, 23));
}
/* Decode 256 packed 10-bit values (per 32-bit lane) back into uint32s.
 * Reads 10 unaligned 256-bit words; the layout repeats every 16 values
 * (16 * 10 = 5 * 32 bits), so the second half mirrors the first. */
static void avxunpackblock10(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 10) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 10)));
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 20)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 30), _mm256_slli_epi32(r1, 2))));
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 18)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 28), _mm256_slli_epi32(r0, 4))));
  _mm256_storeu_si256(dst + 7,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 6)));
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 26), _mm256_slli_epi32(r1, 6))));
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m, _mm256_srli_epi32(r1, 4)));
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m, _mm256_srli_epi32(r1, 14)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 24), _mm256_slli_epi32(r0, 8))));
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m, _mm256_srli_epi32(r0, 2)));
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m, _mm256_srli_epi32(r0, 12)));
  /* bit 22 + 10 bits exactly fills the word: no mask needed */
  _mm256_storeu_si256(dst + 15, _mm256_srli_epi32(r0, 22));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m, _mm256_srli_epi32(r1, 10)));
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m, _mm256_srli_epi32(r1, 20)));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 30), _mm256_slli_epi32(r0, 2))));
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m, _mm256_srli_epi32(r0, 18)));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 28), _mm256_slli_epi32(r1, 4))));
  _mm256_storeu_si256(dst + 23, _mm256_and_si256(m, _mm256_srli_epi32(r1, 6)));
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  r0 = _mm256_lddqu_si256(compressed + 8);
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 26), _mm256_slli_epi32(r0, 6))));
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m, _mm256_srli_epi32(r0, 4)));
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m, _mm256_srli_epi32(r0, 14)));
  r1 = _mm256_lddqu_si256(compressed + 9);
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 24), _mm256_slli_epi32(r1, 8))));
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m, _mm256_srli_epi32(r1, 2)));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r1, 22));
}
/* Decode 256 packed 11-bit values (per 32-bit lane) back into uint32s.
 * Reads 11 unaligned 256-bit words; since gcd(11, 32) == 1 the pattern
 * never repeats inside the block and almost every third value straddles
 * a word boundary. */
static void avxunpackblock11(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 11) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 11)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 22), _mm256_slli_epi32(r1, 10))));
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 1)));
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 23), _mm256_slli_epi32(r0, 9))));
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 2)));
  _mm256_storeu_si256(dst + 7,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 13)));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 24), _mm256_slli_epi32(r1, 8))));
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 3)));
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m, _mm256_srli_epi32(r1, 14)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 25), _mm256_slli_epi32(r0, 7))));
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m, _mm256_srli_epi32(r0, 4)));
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m, _mm256_srli_epi32(r0, 15)));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 26), _mm256_slli_epi32(r1, 6))));
  _mm256_storeu_si256(dst + 15, _mm256_and_si256(m, _mm256_srli_epi32(r1, 5)));
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 27), _mm256_slli_epi32(r0, 5))));
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m, _mm256_srli_epi32(r0, 6)));
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m, _mm256_srli_epi32(r0, 17)));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 28), _mm256_slli_epi32(r1, 4))));
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m, _mm256_srli_epi32(r1, 7)));
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m, _mm256_srli_epi32(r1, 18)));
  r0 = _mm256_lddqu_si256(compressed + 8);
  _mm256_storeu_si256(dst + 23, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 29), _mm256_slli_epi32(r0, 3))));
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m, _mm256_srli_epi32(r0, 19)));
  r1 = _mm256_lddqu_si256(compressed + 9);
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 30), _mm256_slli_epi32(r1, 2))));
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m, _mm256_srli_epi32(r1, 9)));
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m, _mm256_srli_epi32(r1, 20)));
  r0 = _mm256_lddqu_si256(compressed + 10);
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 31), _mm256_slli_epi32(r0, 1))));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r0, 10)));
  /* last value is flush with the top of the word: no mask needed */
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r0, 21));
}
/* Decode 256 packed 12-bit values (per 32-bit lane) back into uint32s.
 * Reads 12 unaligned 256-bit words; the layout repeats every 8 values
 * (8 * 12 = 3 * 32 bits), so the same 3-word pattern occurs four times. */
static void avxunpackblock12(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 12) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 12)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 24), _mm256_slli_epi32(r1, 8))));
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 4)));
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 28), _mm256_slli_epi32(r0, 4))));
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  /* bit 20 + 12 bits exactly fills the word: no mask needed */
  _mm256_storeu_si256(dst + 7,  _mm256_srli_epi32(r0, 20));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 24), _mm256_slli_epi32(r0, 8))));
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m, _mm256_srli_epi32(r0, 4)));
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 28), _mm256_slli_epi32(r1, 4))));
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 15, _mm256_srli_epi32(r1, 20));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m, _mm256_srli_epi32(r0, 12)));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 24), _mm256_slli_epi32(r1, 8))));
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m, _mm256_srli_epi32(r1, 4)));
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  r0 = _mm256_lddqu_si256(compressed + 8);
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 28), _mm256_slli_epi32(r0, 4))));
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  _mm256_storeu_si256(dst + 23, _mm256_srli_epi32(r0, 20));
  r1 = _mm256_lddqu_si256(compressed + 9);
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  r0 = _mm256_lddqu_si256(compressed + 10);
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 24), _mm256_slli_epi32(r0, 8))));
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m, _mm256_srli_epi32(r0, 4)));
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  r1 = _mm256_lddqu_si256(compressed + 11);
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 28), _mm256_slli_epi32(r1, 4))));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r1, 20));
}
/* Decode 256 packed 13-bit values (per 32-bit lane) back into uint32s.
 * Reads 13 unaligned 256-bit words; gcd(13, 32) == 1, so the bit offset
 * walks through every residue and many values straddle word boundaries. */
static void avxunpackblock13(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 13) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 13)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 26), _mm256_slli_epi32(r1, 6))));
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 7)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 20), _mm256_slli_epi32(r0, 12))));
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 1)));
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 14)));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 7,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 27), _mm256_slli_epi32(r1, 5))));
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 21), _mm256_slli_epi32(r0, 11))));
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m, _mm256_srli_epi32(r0, 2)));
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m, _mm256_srli_epi32(r0, 15)));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 28), _mm256_slli_epi32(r1, 4))));
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m, _mm256_srli_epi32(r1, 9)));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 22), _mm256_slli_epi32(r0, 10))));
  _mm256_storeu_si256(dst + 15, _mm256_and_si256(m, _mm256_srli_epi32(r0, 3)));
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 29), _mm256_slli_epi32(r1, 3))));
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m, _mm256_srli_epi32(r1, 10)));
  r0 = _mm256_lddqu_si256(compressed + 8);
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 23), _mm256_slli_epi32(r0, 9))));
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m, _mm256_srli_epi32(r0, 4)));
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m, _mm256_srli_epi32(r0, 17)));
  r1 = _mm256_lddqu_si256(compressed + 9);
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 30), _mm256_slli_epi32(r1, 2))));
  _mm256_storeu_si256(dst + 23, _mm256_and_si256(m, _mm256_srli_epi32(r1, 11)));
  r0 = _mm256_lddqu_si256(compressed + 10);
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 24), _mm256_slli_epi32(r0, 8))));
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m, _mm256_srli_epi32(r0, 5)));
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m, _mm256_srli_epi32(r0, 18)));
  r1 = _mm256_lddqu_si256(compressed + 11);
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 31), _mm256_slli_epi32(r1, 1))));
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  r0 = _mm256_lddqu_si256(compressed + 12);
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 25), _mm256_slli_epi32(r0, 7))));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r0, 6)));
  /* last value is flush with the top of the word: no mask needed */
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r0, 19));
}
/* Decode 256 packed 14-bit values (per 32-bit lane) back into uint32s.
 * Reads 14 unaligned 256-bit words; the layout repeats every 16 values
 * (16 * 14 = 7 * 32 bits), so the second half mirrors the first. */
static void avxunpackblock14(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 14) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 14)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 28), _mm256_slli_epi32(r1, 4))));
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 10)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 24), _mm256_slli_epi32(r0, 8))));
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 6)));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 20), _mm256_slli_epi32(r1, 12))));
  _mm256_storeu_si256(dst + 7,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 2)));
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 30), _mm256_slli_epi32(r0, 2))));
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m, _mm256_srli_epi32(r0, 12)));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 26), _mm256_slli_epi32(r1, 6))));
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 22), _mm256_slli_epi32(r0, 10))));
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m, _mm256_srli_epi32(r0, 4)));
  /* bit 18 + 14 bits exactly fills the word: no mask needed */
  _mm256_storeu_si256(dst + 15, _mm256_srli_epi32(r0, 18));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, r1));
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m, _mm256_srli_epi32(r1, 14)));
  r0 = _mm256_lddqu_si256(compressed + 8);
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 28), _mm256_slli_epi32(r0, 4))));
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m, _mm256_srli_epi32(r0, 10)));
  r1 = _mm256_lddqu_si256(compressed + 9);
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 24), _mm256_slli_epi32(r1, 8))));
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m, _mm256_srli_epi32(r1, 6)));
  r0 = _mm256_lddqu_si256(compressed + 10);
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 20), _mm256_slli_epi32(r0, 12))));
  _mm256_storeu_si256(dst + 23, _mm256_and_si256(m, _mm256_srli_epi32(r0, 2)));
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m, _mm256_srli_epi32(r0, 16)));
  r1 = _mm256_lddqu_si256(compressed + 11);
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 30), _mm256_slli_epi32(r1, 2))));
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  r0 = _mm256_lddqu_si256(compressed + 12);
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 26), _mm256_slli_epi32(r0, 6))));
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m, _mm256_srli_epi32(r0, 8)));
  r1 = _mm256_lddqu_si256(compressed + 13);
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 22), _mm256_slli_epi32(r1, 10))));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r1, 4)));
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r1, 18));
}
/* Decode 256 packed 15-bit values (per 32-bit lane) back into uint32s.
 * Reads 15 unaligned 256-bit words; after the first two values every
 * second value straddles a word boundary and is stitched together from
 * the low bits of the next word. */
static void avxunpackblock15(const __m256i * compressed, uint32_t * pout) {
  __m256i r0, r1;                          /* two rotating input registers */
  __m256i *dst = (__m256i *) pout;
  const __m256i m = _mm256_set1_epi32((1U << 15) - 1);
  r0 = _mm256_lddqu_si256(compressed);
  _mm256_storeu_si256(dst + 0,  _mm256_and_si256(m, r0));
  _mm256_storeu_si256(dst + 1,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 15)));
  r1 = _mm256_lddqu_si256(compressed + 1);
  _mm256_storeu_si256(dst + 2,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 30), _mm256_slli_epi32(r1, 2))));
  _mm256_storeu_si256(dst + 3,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 13)));
  r0 = _mm256_lddqu_si256(compressed + 2);
  _mm256_storeu_si256(dst + 4,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 28), _mm256_slli_epi32(r0, 4))));
  _mm256_storeu_si256(dst + 5,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 11)));
  r1 = _mm256_lddqu_si256(compressed + 3);
  _mm256_storeu_si256(dst + 6,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 26), _mm256_slli_epi32(r1, 6))));
  _mm256_storeu_si256(dst + 7,  _mm256_and_si256(m, _mm256_srli_epi32(r1, 9)));
  r0 = _mm256_lddqu_si256(compressed + 4);
  _mm256_storeu_si256(dst + 8,  _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 24), _mm256_slli_epi32(r0, 8))));
  _mm256_storeu_si256(dst + 9,  _mm256_and_si256(m, _mm256_srli_epi32(r0, 7)));
  r1 = _mm256_lddqu_si256(compressed + 5);
  _mm256_storeu_si256(dst + 10, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 22), _mm256_slli_epi32(r1, 10))));
  _mm256_storeu_si256(dst + 11, _mm256_and_si256(m, _mm256_srli_epi32(r1, 5)));
  r0 = _mm256_lddqu_si256(compressed + 6);
  _mm256_storeu_si256(dst + 12, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 20), _mm256_slli_epi32(r0, 12))));
  _mm256_storeu_si256(dst + 13, _mm256_and_si256(m, _mm256_srli_epi32(r0, 3)));
  r1 = _mm256_lddqu_si256(compressed + 7);
  _mm256_storeu_si256(dst + 14, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 18), _mm256_slli_epi32(r1, 14))));
  _mm256_storeu_si256(dst + 15, _mm256_and_si256(m, _mm256_srli_epi32(r1, 1)));
  _mm256_storeu_si256(dst + 16, _mm256_and_si256(m, _mm256_srli_epi32(r1, 16)));
  r0 = _mm256_lddqu_si256(compressed + 8);
  _mm256_storeu_si256(dst + 17, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 31), _mm256_slli_epi32(r0, 1))));
  _mm256_storeu_si256(dst + 18, _mm256_and_si256(m, _mm256_srli_epi32(r0, 14)));
  r1 = _mm256_lddqu_si256(compressed + 9);
  _mm256_storeu_si256(dst + 19, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 29), _mm256_slli_epi32(r1, 3))));
  _mm256_storeu_si256(dst + 20, _mm256_and_si256(m, _mm256_srli_epi32(r1, 12)));
  r0 = _mm256_lddqu_si256(compressed + 10);
  _mm256_storeu_si256(dst + 21, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 27), _mm256_slli_epi32(r0, 5))));
  _mm256_storeu_si256(dst + 22, _mm256_and_si256(m, _mm256_srli_epi32(r0, 10)));
  r1 = _mm256_lddqu_si256(compressed + 11);
  _mm256_storeu_si256(dst + 23, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 25), _mm256_slli_epi32(r1, 7))));
  _mm256_storeu_si256(dst + 24, _mm256_and_si256(m, _mm256_srli_epi32(r1, 8)));
  r0 = _mm256_lddqu_si256(compressed + 12);
  _mm256_storeu_si256(dst + 25, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 23), _mm256_slli_epi32(r0, 9))));
  _mm256_storeu_si256(dst + 26, _mm256_and_si256(m, _mm256_srli_epi32(r0, 6)));
  r1 = _mm256_lddqu_si256(compressed + 13);
  _mm256_storeu_si256(dst + 27, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r0, 21), _mm256_slli_epi32(r1, 11))));
  _mm256_storeu_si256(dst + 28, _mm256_and_si256(m, _mm256_srli_epi32(r1, 4)));
  r0 = _mm256_lddqu_si256(compressed + 14);
  _mm256_storeu_si256(dst + 29, _mm256_and_si256(m,
      _mm256_or_si256(_mm256_srli_epi32(r1, 19), _mm256_slli_epi32(r0, 13))));
  _mm256_storeu_si256(dst + 30, _mm256_and_si256(m, _mm256_srli_epi32(r0, 2)));
  /* last value is flush with the top of the word: no mask needed */
  _mm256_storeu_si256(dst + 31, _mm256_srli_epi32(r0, 17));
}
_mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) ); + _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 17) ); +} + + +/* we packed 256 16-bit values, touching 16 256-bit words, using 256 bytes */ +static void avxunpackblock16(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 16 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(65535); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 1, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 3, _mm256_srli_epi32( w1 , 16) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 5, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 16) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 9, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 11, _mm256_srli_epi32( w1 , 16) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 13, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, w1 ) ); + 
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 16) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 17, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 19, _mm256_srli_epi32( w1 , 16) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 21, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 16) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 25, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 27, _mm256_srli_epi32( w1 , 16) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w0 ) ); + _mm256_storeu_si256(out + 29, _mm256_srli_epi32( w0 , 16) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, w1 ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 16) ); +} + + +/* we packed 256 17-bit values, touching 17 256-bit words, using 272 bytes */ +static void avxunpackblock17(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 17 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(131071); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( 
w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) ); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + 
_mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 
(_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 15) ); +} + + +/* we packed 256 18-bit values, touching 18 256-bit words, using 288 bytes */ +static void avxunpackblock18(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 18 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(262143); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) 
); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 14) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 
2) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 14) ); +} + + +/* we packed 256 19-bit values, touching 19 256-bit words, using 304 bytes */ +static void avxunpackblock19(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 19 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(524287); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + 
_mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 11) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) ); + w0 = _mm256_lddqu_si256 
(compressed + 12); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( w0 , 11 ) ) ) ); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 13) ); +} + + +/* we packed 256 20-bit values, touching 20 256-bit words, using 320 bytes */ +static void avxunpackblock20(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 20 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = 
_mm256_set1_epi32(1048575); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 12) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + 
_mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 12) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 12) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 
(_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 12) ); +} + + +/* we packed 256 21-bit values, touching 21 256-bit words, using 336 bytes */ +static void avxunpackblock21(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 21 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(2097151); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) ); + _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) ); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 
8) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) ); + _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) ); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) 
,_mm256_slli_epi32( w0 , 7 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) ); + _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) ); + _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 11) ); +} + + +/* we packed 256 22-bit values, touching 22 256-bit words, using 352 bytes */ +static void avxunpackblock22(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 22 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(4194303); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( 
mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 14, + 
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 10) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) 
,_mm256_slli_epi32( w1 , 14 ) ) ) ); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 10) ); +} + + +/* we packed 256 23-bit values, touching 23 256-bit words, using 368 bytes */ +static void avxunpackblock23(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 23 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(8388607); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) ); + 
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 11) ,_mm256_slli_epi32( w0 , 21 ) ) ) ); + _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( 
w0 , 11 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) ); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) ); + _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 9) ); +} + + +/* we packed 256 24-bit values, touching 24 256-bit words, using 384 bytes */ +static void avxunpackblock24(const __m256i * compressed, uint32_t * pout) { + /* 
we are going to access 24 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(16777215); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 3, _mm256_srli_epi32( w0 , 8) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 8) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 11, _mm256_srli_epi32( w0 , 8) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 
(_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 8) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 19, _mm256_srli_epi32( w0 , 8) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 8) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + _mm256_storeu_si256(out + 27, _mm256_srli_epi32( w0 , 8) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + 
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 8) ); +} + + +/* we packed 256 25-bit values, touching 25 256-bit words, using 400 bytes */ +static void avxunpackblock25(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 25 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(33554431); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) ); + _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 7, + 
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 15) ,_mm256_slli_epi32( w0 , 17 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) ); + _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 9) ,_mm256_slli_epi32( w0 , 23 ) ) ) ); + _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 
27) ,_mm256_slli_epi32( w1 , 5 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) ); + _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 17) ,_mm256_slli_epi32( w0 , 15 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) ); + _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 7) ); +} + + +/* we packed 256 26-bit values, touching 
26 256-bit words, using 416 bytes */ +static void avxunpackblock26(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 26 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(67108863); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) ); + _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, 
_mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 6) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed 
+ 19); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) ); + _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 25); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 6) ); +} + + +/* we packed 256 27-bit values, touching 27 256-bit words, using 432 bytes */ +static void avxunpackblock27(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 27 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(134217727); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) 
,_mm256_slli_epi32( w1 , 5 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 7) ,_mm256_slli_epi32( w1 , 25 ) ) ) ); + _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 9) ,_mm256_slli_epi32( w0 , 23 ) ) ) ); + _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 
(_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) ); + _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 
) ) ) ); + _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 25); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 26); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 5) ); +} + + +/* we packed 256 28-bit values, touching 28 256-bit words, using 448 bytes */ +static void avxunpackblock28(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 28 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(268435455); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 
(_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) ); + _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 4) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 4) ); + w0 = _mm256_lddqu_si256 (compressed + 
14); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) ); + _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 4) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 25); + _mm256_storeu_si256(out + 28, + 
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 26); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 27); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 4) ); +} + + +/* we packed 256 29-bit values, touching 29 256-bit words, using 464 bytes */ +static void avxunpackblock29(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 29 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(536870911); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 
(_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 5) ,_mm256_slli_epi32( w1 , 27 ) ) ) ); + _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) 
) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 7) ,_mm256_slli_epi32( w0 , 25 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 4) ,_mm256_slli_epi32( w1 , 28 ) ) ) ); + _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 25); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 26); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 27); + 
_mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 9) ,_mm256_slli_epi32( w1 , 23 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 28); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 3) ); +} + + +/* we packed 256 30-bit values, touching 30 256-bit words, using 480 bytes */ +static void avxunpackblock30(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 30 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(1073741823); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 7, + 
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 6) ,_mm256_slli_epi32( w1 , 26 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 4) ,_mm256_slli_epi32( w0 , 28 ) ) ) ); + _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 2) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + 
_mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 25); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 26); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 27); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 28); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) ); + w1 
= _mm256_lddqu_si256 (compressed + 29); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 4) ,_mm256_slli_epi32( w1 , 28 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 2) ); +} + + +/* we packed 256 31-bit values, touching 31 256-bit words, using 496 bytes */ +static void avxunpackblock31(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 31 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + const __m256i mask = _mm256_set1_epi32(2147483647); + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 4, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 5, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 6, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 7, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 8); + 
_mm256_storeu_si256(out + 8, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 9, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 10, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 11, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 12, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 13, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 14, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 15, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 16, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 17); + _mm256_storeu_si256(out + 17, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 18, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) ); + w1 
= _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 19, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 20, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 21, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 22, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 23, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 9) ,_mm256_slli_epi32( w1 , 23 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 24, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 25); + _mm256_storeu_si256(out + 25, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 7) ,_mm256_slli_epi32( w1 , 25 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 26); + _mm256_storeu_si256(out + 26, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 27); + _mm256_storeu_si256(out + 27, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 5) ,_mm256_slli_epi32( w1 , 27 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 28); + _mm256_storeu_si256(out + 28, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 4) ,_mm256_slli_epi32( w0 , 28 ) ) ) ); + w1 = _mm256_lddqu_si256 (compressed + 29); + _mm256_storeu_si256(out + 29, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 3) 
,_mm256_slli_epi32( w1 , 29 ) ) ) ); + w0 = _mm256_lddqu_si256 (compressed + 30); + _mm256_storeu_si256(out + 30, + _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 2) ,_mm256_slli_epi32( w0 , 30 ) ) ) ); + _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 1) ); +} + + +/* we packed 256 32-bit values, touching 32 256-bit words, using 512 bytes */ +static void avxunpackblock32(const __m256i * compressed, uint32_t * pout) { + /* we are going to access 32 256-bit words */ + __m256i w0, w1; + __m256i * out = (__m256i *) pout; + w0 = _mm256_lddqu_si256 (compressed); + _mm256_storeu_si256(out + 0, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 1); + _mm256_storeu_si256(out + 1, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 2); + _mm256_storeu_si256(out + 2, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 3); + _mm256_storeu_si256(out + 3, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 4); + _mm256_storeu_si256(out + 4, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 5); + _mm256_storeu_si256(out + 5, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 6); + _mm256_storeu_si256(out + 6, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 7); + _mm256_storeu_si256(out + 7, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 8); + _mm256_storeu_si256(out + 8, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 9); + _mm256_storeu_si256(out + 9, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 10); + _mm256_storeu_si256(out + 10, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 11); + _mm256_storeu_si256(out + 11, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 12); + _mm256_storeu_si256(out + 12, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 13); + _mm256_storeu_si256(out + 13, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 14); + _mm256_storeu_si256(out + 14, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 15); + _mm256_storeu_si256(out + 15, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 16); + _mm256_storeu_si256(out + 16, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 17); + 
_mm256_storeu_si256(out + 17, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 18); + _mm256_storeu_si256(out + 18, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 19); + _mm256_storeu_si256(out + 19, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 20); + _mm256_storeu_si256(out + 20, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 21); + _mm256_storeu_si256(out + 21, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 22); + _mm256_storeu_si256(out + 22, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 23); + _mm256_storeu_si256(out + 23, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 24); + _mm256_storeu_si256(out + 24, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 25); + _mm256_storeu_si256(out + 25, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 26); + _mm256_storeu_si256(out + 26, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 27); + _mm256_storeu_si256(out + 27, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 28); + _mm256_storeu_si256(out + 28, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 29); + _mm256_storeu_si256(out + 29, w1 ); + w0 = _mm256_lddqu_si256 (compressed + 30); + _mm256_storeu_si256(out + 30, w0 ); + w1 = _mm256_lddqu_si256 (compressed + 31); + _mm256_storeu_si256(out + 31, w1 ); +} + +static avxpackblockfnc avxfuncPackArr[] = { +&avxpackblock0, +&avxpackblock1, +&avxpackblock2, +&avxpackblock3, +&avxpackblock4, +&avxpackblock5, +&avxpackblock6, +&avxpackblock7, +&avxpackblock8, +&avxpackblock9, +&avxpackblock10, +&avxpackblock11, +&avxpackblock12, +&avxpackblock13, +&avxpackblock14, +&avxpackblock15, +&avxpackblock16, +&avxpackblock17, +&avxpackblock18, +&avxpackblock19, +&avxpackblock20, +&avxpackblock21, +&avxpackblock22, +&avxpackblock23, +&avxpackblock24, +&avxpackblock25, +&avxpackblock26, +&avxpackblock27, +&avxpackblock28, +&avxpackblock29, +&avxpackblock30, +&avxpackblock31, +&avxpackblock32 +}; +static avxpackblockfnc avxfuncPackMaskArr[] = { +&avxpackblockmask0, +&avxpackblockmask1, +&avxpackblockmask2, +&avxpackblockmask3, +&avxpackblockmask4, 
+&avxpackblockmask5, +&avxpackblockmask6, +&avxpackblockmask7, +&avxpackblockmask8, +&avxpackblockmask9, +&avxpackblockmask10, +&avxpackblockmask11, +&avxpackblockmask12, +&avxpackblockmask13, +&avxpackblockmask14, +&avxpackblockmask15, +&avxpackblockmask16, +&avxpackblockmask17, +&avxpackblockmask18, +&avxpackblockmask19, +&avxpackblockmask20, +&avxpackblockmask21, +&avxpackblockmask22, +&avxpackblockmask23, +&avxpackblockmask24, +&avxpackblockmask25, +&avxpackblockmask26, +&avxpackblockmask27, +&avxpackblockmask28, +&avxpackblockmask29, +&avxpackblockmask30, +&avxpackblockmask31, +&avxpackblockmask32 +}; +static avxunpackblockfnc avxfuncUnpackArr[] = { +&avxunpackblock0, +&avxunpackblock1, +&avxunpackblock2, +&avxunpackblock3, +&avxunpackblock4, +&avxunpackblock5, +&avxunpackblock6, +&avxunpackblock7, +&avxunpackblock8, +&avxunpackblock9, +&avxunpackblock10, +&avxunpackblock11, +&avxunpackblock12, +&avxunpackblock13, +&avxunpackblock14, +&avxunpackblock15, +&avxunpackblock16, +&avxunpackblock17, +&avxunpackblock18, +&avxunpackblock19, +&avxunpackblock20, +&avxunpackblock21, +&avxunpackblock22, +&avxunpackblock23, +&avxunpackblock24, +&avxunpackblock25, +&avxunpackblock26, +&avxunpackblock27, +&avxunpackblock28, +&avxunpackblock29, +&avxunpackblock30, +&avxunpackblock31, +&avxunpackblock32 +}; +/** code generated by avxpacking.py ends here **/ + + + + + + + + + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpack(const uint32_t * in,__m256i * out, const uint32_t bit) { + avxfuncPackMaskArr[bit](in,out); +} + +/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ +void avxpackwithoutmask(const uint32_t * in,__m256i * out, const uint32_t bit) { + avxfuncPackArr[bit](in,out); +} + +/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */ +void avxunpack(const __m256i * in,uint32_t * out, const uint32_t bit) { + avxfuncUnpackArr[bit](in,out); +} + +#endif /* __AVX2__ */ diff --git 
a/src/simdbitpacking.c b/src/simdbitpacking.c new file mode 100644 index 000000000..d249091e9 --- /dev/null +++ b/src/simdbitpacking.c @@ -0,0 +1,14183 @@ +/** + * This code is released under a BSD License. + */ +#include "simdbitpacking.h" + + +static void SIMD_nullunpacker32(const __m128i * _in , uint32_t * out) { + (void) _in; + memset(out,0,32 * 4 * 4); +} + +static void __SIMD_fastpackwithoutmask1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + 
InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask5_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask7_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg 
= _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask9_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask10_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask11_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask12_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask13_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask14_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg 
= _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask15_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask17_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask18_32(const uint32_t * _in, __m128i * out) 
{ + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
/* Pack 128 24-bit integers (four interleaved lanes of 32 values each) from
 * _in into 24 consecutive 128-bit words at out.  Inputs must already fit in
 * 24 bits: no masking is applied.  Behaviour matches the fully unrolled
 * generated version: each input vector is loaded exactly once and the packed
 * words are stored in the same order. */
static void __SIMD_fastpackwithoutmask24_32(const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    __m128i accum = _mm_setzero_si128();
    int used = 0;               /* bits already occupied in accum, per lane */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i w = _mm_loadu_si128(in + k);
        /* Append the next 24-bit field above the bits already present. */
        accum = _mm_or_si128(accum, _mm_slli_epi32(w, used));
        used += 24;
        if (used >= 32) {
            /* Lane word is full: flush it, then seed the next word with the
             * high bits of w that did not fit (if any). */
            _mm_storeu_si128(out, accum);
            ++out;
            used -= 32;
            accum = used ? _mm_srli_epi32(w, 24 - used) : _mm_setzero_si128();
        }
    }
}
/* Pack 128 25-bit integers (four interleaved lanes of 32 values each) from
 * _in into 25 consecutive 128-bit words at out.  Inputs must already fit in
 * 25 bits: no masking is applied.  Equivalent to the unrolled generated
 * version: same loads, same stores, same order. */
static void __SIMD_fastpackwithoutmask25_32(const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    __m128i accum = _mm_setzero_si128();
    int used = 0;               /* bits already occupied in accum, per lane */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i w = _mm_loadu_si128(in + k);
        /* Append the next 25-bit field above the bits already present. */
        accum = _mm_or_si128(accum, _mm_slli_epi32(w, used));
        used += 25;
        if (used >= 32) {
            /* Lane word is full: flush it, then carry over the high bits of
             * w that straddled the 32-bit boundary. */
            _mm_storeu_si128(out, accum);
            ++out;
            used -= 32;
            accum = used ? _mm_srli_epi32(w, 25 - used) : _mm_setzero_si128();
        }
    }
}
/* Pack 128 26-bit integers (four interleaved lanes of 32 values each) from
 * _in into 26 consecutive 128-bit words at out.  Inputs must already fit in
 * 26 bits: no masking is applied.  Equivalent to the unrolled generated
 * version: same loads, same stores, same order. */
static void __SIMD_fastpackwithoutmask26_32(const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    __m128i accum = _mm_setzero_si128();
    int used = 0;               /* bits already occupied in accum, per lane */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i w = _mm_loadu_si128(in + k);
        /* Append the next 26-bit field above the bits already present. */
        accum = _mm_or_si128(accum, _mm_slli_epi32(w, used));
        used += 26;
        if (used >= 32) {
            /* Lane word is full: flush it, then carry over the high bits of
             * w that straddled the 32-bit boundary. */
            _mm_storeu_si128(out, accum);
            ++out;
            used -= 32;
            accum = used ? _mm_srli_epi32(w, 26 - used) : _mm_setzero_si128();
        }
    }
}
/* Pack 128 27-bit integers (four interleaved lanes of 32 values each) from
 * _in into 27 consecutive 128-bit words at out.  Inputs must already fit in
 * 27 bits: no masking is applied.  Equivalent to the unrolled generated
 * version: same loads, same stores, same order. */
static void __SIMD_fastpackwithoutmask27_32(const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    __m128i accum = _mm_setzero_si128();
    int used = 0;               /* bits already occupied in accum, per lane */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i w = _mm_loadu_si128(in + k);
        /* Append the next 27-bit field above the bits already present. */
        accum = _mm_or_si128(accum, _mm_slli_epi32(w, used));
        used += 27;
        if (used >= 32) {
            /* Lane word is full: flush it, then carry over the high bits of
             * w that straddled the 32-bit boundary. */
            _mm_storeu_si128(out, accum);
            ++out;
            used -= 32;
            accum = used ? _mm_srli_epi32(w, 27 - used) : _mm_setzero_si128();
        }
    }
}
/* Pack 128 28-bit integers (four interleaved lanes of 32 values each) from
 * _in into 28 consecutive 128-bit words at out.  Inputs must already fit in
 * 28 bits: no masking is applied.  Equivalent to the unrolled generated
 * version: same loads, same stores, same order. */
static void __SIMD_fastpackwithoutmask28_32(const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    __m128i accum = _mm_setzero_si128();
    int used = 0;               /* bits already occupied in accum, per lane */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i w = _mm_loadu_si128(in + k);
        /* Append the next 28-bit field above the bits already present. */
        accum = _mm_or_si128(accum, _mm_slli_epi32(w, used));
        used += 28;
        if (used >= 32) {
            /* Lane word is full: flush it, then carry over the high bits of
             * w that straddled the 32-bit boundary. */
            _mm_storeu_si128(out, accum);
            ++out;
            used -= 32;
            accum = used ? _mm_srli_epi32(w, 28 - used) : _mm_setzero_si128();
        }
    }
}
/* Pack 128 29-bit integers (four interleaved lanes of 32 values each) from
 * _in into 29 consecutive 128-bit words at out.  Inputs must already fit in
 * 29 bits: no masking is applied.  Equivalent to the unrolled generated
 * version: same loads, same stores, same order. */
static void __SIMD_fastpackwithoutmask29_32(const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    __m128i accum = _mm_setzero_si128();
    int used = 0;               /* bits already occupied in accum, per lane */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i w = _mm_loadu_si128(in + k);
        /* Append the next 29-bit field above the bits already present. */
        accum = _mm_or_si128(accum, _mm_slli_epi32(w, used));
        used += 29;
        if (used >= 32) {
            /* Lane word is full: flush it, then carry over the high bits of
             * w that straddled the 32-bit boundary. */
            _mm_storeu_si128(out, accum);
            ++out;
            used -= 32;
            accum = used ? _mm_srli_epi32(w, 29 - used) : _mm_setzero_si128();
        }
    }
}
/* Pack 128 30-bit integers (four interleaved lanes of 32 values each) from
 * _in into 30 consecutive 128-bit words at out.  Inputs must already fit in
 * 30 bits: no masking is applied.  Equivalent to the unrolled generated
 * version: same loads, same stores, same order. */
static void __SIMD_fastpackwithoutmask30_32(const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    __m128i accum = _mm_setzero_si128();
    int used = 0;               /* bits already occupied in accum, per lane */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i w = _mm_loadu_si128(in + k);
        /* Append the next 30-bit field above the bits already present. */
        accum = _mm_or_si128(accum, _mm_slli_epi32(w, used));
        used += 30;
        if (used >= 32) {
            /* Lane word is full: flush it, then carry over the high bits of
             * w that straddled the 32-bit boundary. */
            _mm_storeu_si128(out, accum);
            ++out;
            used -= 32;
            accum = used ? _mm_srli_epi32(w, 30 - used) : _mm_setzero_si128();
        }
    }
}
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
31 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + 
_mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + 
_mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask4_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_loadu_si128(in+4); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_loadu_si128(in+6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_loadu_si128(in+7); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpackwithoutmask8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, 
OutReg); + ++out; + + in+=4; + } + +} + + + +static void __SIMD_fastpackwithoutmask16_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + +static void __SIMD_fastpack1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); 
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack5_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
+ + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack7_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack9_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack10_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg 
= _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack11_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack12_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack13_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack14_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack15_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack17_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack18_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg 
= _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack24_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack25_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); 
+ __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack26_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = 
_mm_set1_epi32((1U<<26)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack27_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + __m128i InReg = 
_mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack28_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = 
(const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 
20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack29_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = 
_mm_set1_epi32((1U<<29)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack30_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg 
= _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static 
void __SIMD_fastpack4_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + uint32_t outer; + + + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpack8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + uint32_t outer; + + + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +static void __SIMD_fastpack16_32(const uint32_t * _in, __m128i * out) { + 
const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + uint32_t outer; + + + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + + +static void __SIMD_fastunpack1_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + __m128i InReg1 = _mm_loadu_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + uint32_t i, shift = 0; + + for (i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + _mm_storeu_si128(out++, OutReg1); + _mm_storeu_si128(out++, OutReg2); + _mm_storeu_si128(out++, OutReg3); + _mm_storeu_si128(out++, OutReg4); + } +} + + + + +static void __SIMD_fastunpack2_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); 
+ + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack3_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void 
__SIMD_fastunpack4_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack5_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + 
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack6_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; 
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack7_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack8_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack9_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + OutReg = _mm_and_si128( InReg , 
mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack10_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask 
= _mm_set1_epi32((1U<<10)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); 
+ _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack11_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = 
_mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , 
mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack12_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack13_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack14_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack15_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack16_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + 
+ OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + 
InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack17_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack18_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = 
_mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack19_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack20_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + OutReg = _mm_and_si128( InReg , 
mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack21_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), 
mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack22_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i 
mask = _mm_set1_epi32((1U<<22)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + 
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack23_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack24_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + 
InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack25_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), 
mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack26_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) 
; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack27_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack28_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg 
, mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + _mm_storeu_si128(out++, OutReg); + + +} + + 
+ + +static void __SIMD_fastunpack29_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), 
mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + _mm_storeu_si128(out++, OutReg); + 
+ +} + + + + +static void __SIMD_fastunpack30_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,2) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack31_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + 
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,1) ; + _mm_storeu_si128(out++, OutReg); + + +} + + +void __SIMD_fastunpack32_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + uint32_t outer; + + for(outer=0; outer< 32 ;++outer) { + _mm_storeu_si128(out++, _mm_loadu_si128(in++)); + } +} + + + +void simdunpack(const __m128i * in, uint32_t * out, const uint32_t bit) { + switch(bit) { + case 0: SIMD_nullunpacker32(in,out); return; + + case 1: __SIMD_fastunpack1_32(in,out); return; + + case 2: __SIMD_fastunpack2_32(in,out); return; + + case 3: __SIMD_fastunpack3_32(in,out); return; + + case 4: __SIMD_fastunpack4_32(in,out); return; + + case 5: __SIMD_fastunpack5_32(in,out); return; + + case 6: __SIMD_fastunpack6_32(in,out); return; + + case 7: __SIMD_fastunpack7_32(in,out); return; + + case 8: __SIMD_fastunpack8_32(in,out); return; + + case 9: __SIMD_fastunpack9_32(in,out); return; + + case 10: __SIMD_fastunpack10_32(in,out); return; + + case 11: __SIMD_fastunpack11_32(in,out); return; + + case 12: __SIMD_fastunpack12_32(in,out); return; + + case 13: __SIMD_fastunpack13_32(in,out); return; + + case 14: __SIMD_fastunpack14_32(in,out); return; + + case 15: __SIMD_fastunpack15_32(in,out); return; + + case 16: __SIMD_fastunpack16_32(in,out); return; + + case 17: __SIMD_fastunpack17_32(in,out); return; + + case 18: __SIMD_fastunpack18_32(in,out); return; + + case 19: __SIMD_fastunpack19_32(in,out); return; + + case 20: __SIMD_fastunpack20_32(in,out); return; + + case 21: __SIMD_fastunpack21_32(in,out); return; + + case 22: __SIMD_fastunpack22_32(in,out); return; + + case 23: __SIMD_fastunpack23_32(in,out); return; + + case 24: __SIMD_fastunpack24_32(in,out); return; + + case 25: __SIMD_fastunpack25_32(in,out); return; + + case 26: 
__SIMD_fastunpack26_32(in,out); return; + + case 27: __SIMD_fastunpack27_32(in,out); return; + + case 28: __SIMD_fastunpack28_32(in,out); return; + + case 29: __SIMD_fastunpack29_32(in,out); return; + + case 30: __SIMD_fastunpack30_32(in,out); return; + + case 31: __SIMD_fastunpack31_32(in,out); return; + + case 32: __SIMD_fastunpack32_32(in,out); return; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ +void simdpackwithoutmask(const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: __SIMD_fastpackwithoutmask1_32(in,out); return; + + case 2: __SIMD_fastpackwithoutmask2_32(in,out); return; + + case 3: __SIMD_fastpackwithoutmask3_32(in,out); return; + + case 4: __SIMD_fastpackwithoutmask4_32(in,out); return; + + case 5: __SIMD_fastpackwithoutmask5_32(in,out); return; + + case 6: __SIMD_fastpackwithoutmask6_32(in,out); return; + + case 7: __SIMD_fastpackwithoutmask7_32(in,out); return; + + case 8: __SIMD_fastpackwithoutmask8_32(in,out); return; + + case 9: __SIMD_fastpackwithoutmask9_32(in,out); return; + + case 10: __SIMD_fastpackwithoutmask10_32(in,out); return; + + case 11: __SIMD_fastpackwithoutmask11_32(in,out); return; + + case 12: __SIMD_fastpackwithoutmask12_32(in,out); return; + + case 13: __SIMD_fastpackwithoutmask13_32(in,out); return; + + case 14: __SIMD_fastpackwithoutmask14_32(in,out); return; + + case 15: __SIMD_fastpackwithoutmask15_32(in,out); return; + + case 16: __SIMD_fastpackwithoutmask16_32(in,out); return; + + case 17: __SIMD_fastpackwithoutmask17_32(in,out); return; + + case 18: __SIMD_fastpackwithoutmask18_32(in,out); return; + + case 19: __SIMD_fastpackwithoutmask19_32(in,out); return; + + case 20: __SIMD_fastpackwithoutmask20_32(in,out); return; + + case 21: __SIMD_fastpackwithoutmask21_32(in,out); return; + + case 22: __SIMD_fastpackwithoutmask22_32(in,out); return; + + case 23: __SIMD_fastpackwithoutmask23_32(in,out); return; + + case 24: 
__SIMD_fastpackwithoutmask24_32(in,out); return; + + case 25: __SIMD_fastpackwithoutmask25_32(in,out); return; + + case 26: __SIMD_fastpackwithoutmask26_32(in,out); return; + + case 27: __SIMD_fastpackwithoutmask27_32(in,out); return; + + case 28: __SIMD_fastpackwithoutmask28_32(in,out); return; + + case 29: __SIMD_fastpackwithoutmask29_32(in,out); return; + + case 30: __SIMD_fastpackwithoutmask30_32(in,out); return; + + case 31: __SIMD_fastpackwithoutmask31_32(in,out); return; + + case 32: __SIMD_fastpackwithoutmask32_32(in,out); return; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ +void simdpack(const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: __SIMD_fastpack1_32(in,out); return; + + case 2: __SIMD_fastpack2_32(in,out); return; + + case 3: __SIMD_fastpack3_32(in,out); return; + + case 4: __SIMD_fastpack4_32(in,out); return; + + case 5: __SIMD_fastpack5_32(in,out); return; + + case 6: __SIMD_fastpack6_32(in,out); return; + + case 7: __SIMD_fastpack7_32(in,out); return; + + case 8: __SIMD_fastpack8_32(in,out); return; + + case 9: __SIMD_fastpack9_32(in,out); return; + + case 10: __SIMD_fastpack10_32(in,out); return; + + case 11: __SIMD_fastpack11_32(in,out); return; + + case 12: __SIMD_fastpack12_32(in,out); return; + + case 13: __SIMD_fastpack13_32(in,out); return; + + case 14: __SIMD_fastpack14_32(in,out); return; + + case 15: __SIMD_fastpack15_32(in,out); return; + + case 16: __SIMD_fastpack16_32(in,out); return; + + case 17: __SIMD_fastpack17_32(in,out); return; + + case 18: __SIMD_fastpack18_32(in,out); return; + + case 19: __SIMD_fastpack19_32(in,out); return; + + case 20: __SIMD_fastpack20_32(in,out); return; + + case 21: __SIMD_fastpack21_32(in,out); return; + + case 22: __SIMD_fastpack22_32(in,out); return; + + case 23: __SIMD_fastpack23_32(in,out); return; + + case 24: __SIMD_fastpack24_32(in,out); return; + + case 25: __SIMD_fastpack25_32(in,out); 
return; + + case 26: __SIMD_fastpack26_32(in,out); return; + + case 27: __SIMD_fastpack27_32(in,out); return; + + case 28: __SIMD_fastpack28_32(in,out); return; + + case 29: __SIMD_fastpack29_32(in,out); return; + + case 30: __SIMD_fastpack30_32(in,out); return; + + case 31: __SIMD_fastpack31_32(in,out); return; + + case 32: __SIMD_fastpack32_32(in,out); return; + + default: break; + } +} + + + +__m128i * simdpack_shortlength( const uint32_t * in, int length, __m128i * out, const uint32_t bit) { + int k; + int inwordpointer; + __m128i P; + uint32_t firstpass; + if(bit == 0) return out;/* nothing to do */ + if(bit == 32) { + memcpy(out,in,length*sizeof(uint32_t)); + return (__m128i *)((uint32_t *) out + length); + } + inwordpointer = 0; + P = _mm_setzero_si128(); + for(k = 0; k < length / 4 ; ++k) { + __m128i value = _mm_loadu_si128(((const __m128i * ) in + k)); + P = _mm_or_si128(P,_mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if(bit> firstbits;/* we write */ + return; + } +} + +int simdpack_compressedbytes(int length, const uint32_t bit) { + if(bit == 0) return 0;/* nothing to do */ + if(bit == 32) { + return length * sizeof(uint32_t); + } + return (((length + 3 )/ 4) * bit + 31 ) / 32 * sizeof(__m128i); +} + +__m128i * simdpack_length(const uint32_t * in, size_t length, __m128i * out, const uint32_t bit) { + size_t k; + for(k = 0; k < length / SIMDBlockSize; ++k) { + simdpack(in, out, bit); + in += SIMDBlockSize; + out += bit; + } + return simdpack_shortlength(in, length % SIMDBlockSize, out, bit); +} + +const __m128i * simdunpack_length(const __m128i * in, size_t length, uint32_t * out, const uint32_t bit) { + size_t k; + for(k = 0; k < length / SIMDBlockSize; ++k) { + simdunpack(in, out, bit); + out += SIMDBlockSize; + in += bit; + } + return simdunpack_shortlength(in, length % SIMDBlockSize, out, bit); +} diff --git a/src/simdcomputil.c b/src/simdcomputil.c new file mode 100644 index 000000000..8cdca6d84 --- 
/dev/null +++ b/src/simdcomputil.c @@ -0,0 +1,234 @@ +/** + * This code is released under a BSD License. + */ + +#include "simdcomputil.h" +#ifdef __SSE4_1__ +#include +#endif +#include + +#define Delta(curr, prev) \ + _mm_sub_epi32(curr, \ + _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))) + +/* returns the integer logarithm of v (bit width) */ +uint32_t bits(const uint32_t v) { +#ifdef _MSC_VER + unsigned long answer; + if (v == 0) { + return 0; + } + _BitScanReverse(&answer, v); + return answer + 1; +#else + return v == 0 ? 0 : 32 - __builtin_clz(v); /* assume GCC-like compiler if not microsoft */ +#endif +} + + + +static uint32_t maxbitas32int(const __m128i accumulator) { + const __m128i _tmp1 = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = _mm_or_si128(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + uint32_t ans = _mm_cvtsi128_si32(_tmp2); + return bits(ans); +} + +SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) { + const __m128i* pin = (const __m128i*)(begin); + __m128i accumulator = _mm_loadu_si128(pin); + uint32_t k = 1; + for(; 4*k < SIMDBlockSize; ++k) { + __m128i newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_or_si128(accumulator,newvec); + } + return maxbitas32int(accumulator); +} +static uint32_t orasint(const __m128i accumulator) { + const __m128i _tmp1 = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = _mm_or_si128(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); +} + +#ifdef __SSE4_1__ + +static uint32_t minasint(const __m128i accumulator) { + const __m128i _tmp1 = _mm_min_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = _mm_min_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); 
/* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); +} + +static uint32_t maxasint(const __m128i accumulator) { + const __m128i _tmp1 = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ + const __m128i _tmp2 = _mm_max_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ + return _mm_cvtsi128_si32(_tmp2); +} + +uint32_t simdmin(const uint32_t * in) { + const __m128i* pin = (const __m128i*)(in); + __m128i accumulator = _mm_loadu_si128(pin); + uint32_t k = 1; + for(; 4*k < SIMDBlockSize; ++k) { + __m128i newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_min_epu32(accumulator,newvec); + } + return minasint(accumulator); +} + +void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) { + const __m128i* pin = (const __m128i*)(in); + __m128i minaccumulator = _mm_loadu_si128(pin); + __m128i maxaccumulator = minaccumulator; + uint32_t k = 1; + for(; 4*k < SIMDBlockSize; ++k) { + __m128i newvec = _mm_loadu_si128(pin+k); + minaccumulator = _mm_min_epu32(minaccumulator,newvec); + maxaccumulator = _mm_max_epu32(maxaccumulator,newvec); + } + *getmin = minasint(minaccumulator); + *getmax = maxasint(maxaccumulator); +} + + +uint32_t simdmin_length(const uint32_t * in, uint32_t length) { + uint32_t currentmin = 0xFFFFFFFF; + uint32_t lengthdividedby4 = length / 4; + uint32_t offset = lengthdividedby4 * 4; + uint32_t k; + if (lengthdividedby4 > 0) { + const __m128i* pin = (const __m128i*)(in); + __m128i accumulator = _mm_loadu_si128(pin); + k = 1; + for(; 4*k < lengthdividedby4 * 4; ++k) { + __m128i newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_min_epu32(accumulator,newvec); + } + currentmin = minasint(accumulator); + } + for (k = offset; k < length; ++k) + if (in[k] < currentmin) + currentmin = in[k]; + return currentmin; +} + +void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) { + 
uint32_t lengthdividedby4 = length / 4; + uint32_t offset = lengthdividedby4 * 4; + uint32_t k; + *getmin = 0xFFFFFFFF; + *getmax = 0; + if (lengthdividedby4 > 0) { + const __m128i* pin = (const __m128i*)(in); + __m128i minaccumulator = _mm_loadu_si128(pin); + __m128i maxaccumulator = minaccumulator; + k = 1; + for(; 4*k < lengthdividedby4 * 4; ++k) { + __m128i newvec = _mm_loadu_si128(pin+k); + minaccumulator = _mm_min_epu32(minaccumulator,newvec); + maxaccumulator = _mm_max_epu32(maxaccumulator,newvec); + } + *getmin = minasint(minaccumulator); + *getmax = maxasint(maxaccumulator); + } + for (k = offset; k < length; ++k) { + if (in[k] < *getmin) + *getmin = in[k]; + if (in[k] > *getmax) + *getmax = in[k]; + } +} + +#endif + +SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t * in,uint32_t length) { + uint32_t k; + uint32_t lengthdividedby4 = length / 4; + uint32_t offset = lengthdividedby4 * 4; + uint32_t bigxor = 0; + if(lengthdividedby4 > 0) { + const __m128i* pin = (const __m128i*)(in); + __m128i accumulator = _mm_loadu_si128(pin); + k = 1; + for(; 4*k < 4*lengthdividedby4; ++k) { + __m128i newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_or_si128(accumulator,newvec); + } + bigxor = orasint(accumulator); + } + for(k = offset; k < length; ++k) + bigxor |= in[k]; + return bits(bigxor); +} + + +/* maxbit over 128 integers (SIMDBlockSize) with provided initial value */ +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) { + __m128i initoffset = _mm_set1_epi32 (initvalue); + const __m128i* pin = (const __m128i*)(in); + __m128i newvec = _mm_loadu_si128(pin); + __m128i accumulator = Delta(newvec , initoffset); + __m128i oldvec = newvec; + uint32_t k = 1; + for(; 4*k < SIMDBlockSize; ++k) { + newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec)); + oldvec = newvec; + } + initoffset = oldvec; + return maxbitas32int(accumulator); +} + + +/* maxbit over |length| integers with provided initial value */ 
+uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in, + uint32_t length) { + __m128i newvec; + __m128i oldvec; + __m128i initoffset; + __m128i accumulator; + const __m128i *pin; + uint32_t tmparray[4]; + uint32_t k = 1; + uint32_t acc; + + assert(length > 0); + + pin = (const __m128i *)(in); + initoffset = _mm_set1_epi32(initvalue); + switch (length) { + case 1: + newvec = _mm_set1_epi32(in[0]); + break; + case 2: + newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]); + break; + case 3: + newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]); + break; + default: + newvec = _mm_loadu_si128(pin); + break; + } + accumulator = Delta(newvec, initoffset); + oldvec = newvec; + + /* process 4 integers and build an accumulator */ + while (k * 4 + 4 <= length) { + newvec = _mm_loadu_si128(pin + k); + accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec)); + oldvec = newvec; + k++; + } + + /* extract the accumulator as an integer */ + _mm_storeu_si128((__m128i *)(tmparray), accumulator); + acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]; + + /* now process the remaining integers */ + for (k *= 4; k < length; k++) + acc |= in[k] - (k == 0 ? initvalue : in[k - 1]); + + /* return the number of bits */ + return bits(acc); +} diff --git a/src/simdfor.c b/src/simdfor.c new file mode 100644 index 000000000..67ddaa6b5 --- /dev/null +++ b/src/simdfor.c @@ -0,0 +1,14501 @@ +/** + * This code is released under a BSD License. 
+ */ + +#include "simdfor.h" + + + +static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { + __m128i *out = (__m128i*)(_out); + int i; + (void) _in; + for (i = 0; i < 8; ++i) { + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + _mm_store_si128(out++, initOffset); + } + + return initOffset; +} + + + + +static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { + (void) initOffset; + (void) _in; + (void) out; +} + + +static void ipackFOR1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); 
+ InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); 
+ ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR4(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
_mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + +}

/*
 * ipackFOR10 -- bit-pack 128 offset-subtracted 32-bit values at 10 bits each.
 *
 * Loads 32 aligned __m128i vectors (4 values per step, one per SIMD lane),
 * subtracts initOffset from every value, and appends the low 10 bits of each
 * difference to the output word being assembled in that lane.  A field that
 * straddles a 32-bit lane boundary is split: the low part is OR-ed in before
 * the flush, the remaining high bits seed the next word.  Emits 10 vectors.
 * _in must be 16-byte aligned (aligned SSE2 loads are used).
 */
static void ipackFOR10(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    enum { B = 10 };                       /* field width in bits */
    const __m128i *src = (const __m128i *)(_in);
    __m128i word = _mm_setzero_si128();    /* output word under construction */
    int filled = 0;                        /* bits already used in `word` */
    int k;
    for (k = 0; k < 32; ++k, ++src) {
        const __m128i v = _mm_sub_epi32(_mm_load_si128(src), initOffset);
        word = (filled == 0) ? v
                             : _mm_or_si128(word, _mm_slli_epi32(v, filled));
        filled += B;
        if (filled >= 32) {                /* word is full: flush it */
            _mm_store_si128(out, word);
            ++out;
            filled -= 32;
            if (filled > 0)                /* spill the bits that did not fit */
                word = _mm_srli_epi32(v, B - filled);
        }
    }
}

/*
 * ipackFOR11 -- bit-pack 128 offset-subtracted 32-bit values at 11 bits each.
 *
 * Same scheme as ipackFOR10 with an 11-bit field width: subtract initOffset,
 * keep the low 11 bits, split fields that cross a 32-bit lane boundary across
 * consecutive output words.  Emits 11 vectors.  _in must be 16-byte aligned.
 */
static void ipackFOR11(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    enum { B = 11 };                       /* field width in bits */
    const __m128i *src = (const __m128i *)(_in);
    __m128i word = _mm_setzero_si128();    /* output word under construction */
    int filled = 0;                        /* bits already used in `word` */
    int k;
    for (k = 0; k < 32; ++k, ++src) {
        const __m128i v = _mm_sub_epi32(_mm_load_si128(src), initOffset);
        word = (filled == 0) ? v
                             : _mm_or_si128(word, _mm_slli_epi32(v, filled));
        filled += B;
        if (filled >= 32) {                /* word is full: flush it */
            _mm_store_si128(out, word);
            ++out;
            filled -= 32;
            if (filled > 0)                /* spill the bits that did not fit */
                word = _mm_srli_epi32(v, B - filled);
        }
    }
}

/*
 * ipackFOR12 -- bit-pack 128 offset-subtracted 32-bit values at 12 bits each.
 *
 * Same scheme as ipackFOR10 with a 12-bit field width.  Because 12 and 32
 * share a factor, the fill position returns to zero every eight values and a
 * word then starts fresh with no carried bits.  Emits 12 vectors.  _in must
 * be 16-byte aligned.
 */
static void ipackFOR12(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    enum { B = 12 };                       /* field width in bits */
    const __m128i *src = (const __m128i *)(_in);
    __m128i word = _mm_setzero_si128();    /* output word under construction */
    int filled = 0;                        /* bits already used in `word` */
    int k;
    for (k = 0; k < 32; ++k, ++src) {
        const __m128i v = _mm_sub_epi32(_mm_load_si128(src), initOffset);
        word = (filled == 0) ? v
                             : _mm_or_si128(word, _mm_slli_epi32(v, filled));
        filled += B;
        if (filled >= 32) {                /* word is full: flush it */
            _mm_store_si128(out, word);
            ++out;
            filled -= 32;
            if (filled > 0)                /* spill the bits that did not fit */
                word = _mm_srli_epi32(v, B - filled);
        }
    }
}

/*
 * ipackFOR13 -- bit-pack 128 offset-subtracted 32-bit values at 13 bits each.
 *
 * Same scheme as ipackFOR10 with a 13-bit field width: subtract initOffset,
 * keep the low 13 bits, split boundary-straddling fields across consecutive
 * output words.  Emits 13 vectors.  _in must be 16-byte aligned.
 */
static void ipackFOR13(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    enum { B = 13 };                       /* field width in bits */
    const __m128i *src = (const __m128i *)(_in);
    __m128i word = _mm_setzero_si128();    /* output word under construction */
    int filled = 0;                        /* bits already used in `word` */
    int k;
    for (k = 0; k < 32; ++k, ++src) {
        const __m128i v = _mm_sub_epi32(_mm_load_si128(src), initOffset);
        word = (filled == 0) ? v
                             : _mm_or_si128(word, _mm_slli_epi32(v, filled));
        filled += B;
        if (filled >= 32) {                /* word is full: flush it */
            _mm_store_si128(out, word);
            ++out;
            filled -= 32;
            if (filled > 0)                /* spill the bits that did not fit */
                word = _mm_srli_epi32(v, B - filled);
        }
    }
}

/*
 * ipackFOR14 -- bit-pack 128 offset-subtracted 32-bit values at 14 bits each.
 *
 * Same scheme as ipackFOR10 with a 14-bit field width; the fill position
 * cycles back to zero halfway through (16 values = 7 words).  Emits 14
 * vectors.  _in must be 16-byte aligned.
 */
static void ipackFOR14(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    enum { B = 14 };                       /* field width in bits */
    const __m128i *src = (const __m128i *)(_in);
    __m128i word = _mm_setzero_si128();    /* output word under construction */
    int filled = 0;                        /* bits already used in `word` */
    int k;
    for (k = 0; k < 32; ++k, ++src) {
        const __m128i v = _mm_sub_epi32(_mm_load_si128(src), initOffset);
        word = (filled == 0) ? v
                             : _mm_or_si128(word, _mm_slli_epi32(v, filled));
        filled += B;
        if (filled >= 32) {                /* word is full: flush it */
            _mm_store_si128(out, word);
            ++out;
            filled -= 32;
            if (filled > 0)                /* spill the bits that did not fit */
                word = _mm_srli_epi32(v, B - filled);
        }
    }
}

/*
 * ipackFOR15 -- bit-pack 128 offset-subtracted 32-bit values at 15 bits each.
 *
 * Same scheme as ipackFOR10 with a 15-bit field width: subtract initOffset,
 * keep the low 15 bits, split boundary-straddling fields across consecutive
 * output words.  Emits 15 vectors.  _in must be 16-byte aligned.
 */
static void ipackFOR15(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    enum { B = 15 };                       /* field width in bits */
    const __m128i *src = (const __m128i *)(_in);
    __m128i word = _mm_setzero_si128();    /* output word under construction */
    int filled = 0;                        /* bits already used in `word` */
    int k;
    for (k = 0; k < 32; ++k, ++src) {
        const __m128i v = _mm_sub_epi32(_mm_load_si128(src), initOffset);
        word = (filled == 0) ? v
                             : _mm_or_si128(word, _mm_slli_epi32(v, filled));
        filled += B;
        if (filled >= 32) {                /* word is full: flush it */
            _mm_store_si128(out, word);
            ++out;
            filled -= 32;
            if (filled > 0)                /* spill the bits that did not fit */
                word = _mm_srli_epi32(v, B - filled);
        }
    }
}

+ + + +static void ipackFOR16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + 
__m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
_mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); 
+ + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); 
+ ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
/* Frame-of-reference pack: subtract `initOffset` from 128 32-bit values
 * (4 interleaved SSE lanes of 32 values each) and pack each difference
 * into a 22-bit field.  Writes exactly 22 __m128i words to `out`.
 * `_in` must be 16-byte aligned (loads use _mm_load_si128).
 * NOTE(review): differences are assumed to fit in 22 bits — overflow
 * bits are silently ORed away, as in the unrolled generated form. */
static void ipackFOR22(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    int bitpos = 0; /* write position (mod 32) inside the current output word */

    for (int k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        if (bitpos == 0) {
            acc = delta;                 /* start a fresh output word */
        } else {
            acc = _mm_or_si128(acc, _mm_slli_epi32(delta, bitpos));
        }
        bitpos += 22;
        if (bitpos >= 32) {              /* current word is full: flush it */
            _mm_store_si128(out++, acc);
            bitpos -= 32;
            if (bitpos > 0) {            /* carry the spilled high bits */
                acc = _mm_srli_epi32(delta, 22 - bitpos);
            }
        }
    }
}
/* Frame-of-reference pack: subtract `initOffset` from 128 32-bit values
 * (4 interleaved SSE lanes of 32 values each) and pack each difference
 * into a 23-bit field.  Writes exactly 23 __m128i words to `out`.
 * `_in` must be 16-byte aligned (loads use _mm_load_si128).
 * NOTE(review): differences are assumed to fit in 23 bits — overflow
 * bits are silently ORed away, as in the unrolled generated form. */
static void ipackFOR23(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    int bitpos = 0; /* write position (mod 32) inside the current output word */

    for (int k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        if (bitpos == 0) {
            acc = delta;                 /* start a fresh output word */
        } else {
            acc = _mm_or_si128(acc, _mm_slli_epi32(delta, bitpos));
        }
        bitpos += 23;
        if (bitpos >= 32) {              /* current word is full: flush it */
            _mm_store_si128(out++, acc);
            bitpos -= 32;
            if (bitpos > 0) {            /* carry the spilled high bits */
                acc = _mm_srli_epi32(delta, 23 - bitpos);
            }
        }
    }
}
/* Frame-of-reference pack: subtract `initOffset` from 128 32-bit values
 * (4 interleaved SSE lanes of 32 values each) and pack each difference
 * into a 24-bit field.  Writes exactly 24 __m128i words to `out`.
 * `_in` must be 16-byte aligned (loads use _mm_load_si128).
 * NOTE(review): differences are assumed to fit in 24 bits — overflow
 * bits are silently ORed away, as in the unrolled generated form. */
static void ipackFOR24(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    int bitpos = 0; /* write position (mod 32) inside the current output word */

    for (int k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        if (bitpos == 0) {
            acc = delta;                 /* start a fresh output word */
        } else {
            acc = _mm_or_si128(acc, _mm_slli_epi32(delta, bitpos));
        }
        bitpos += 24;
        if (bitpos >= 32) {              /* current word is full: flush it */
            _mm_store_si128(out++, acc);
            bitpos -= 32;
            if (bitpos > 0) {            /* carry the spilled high bits */
                acc = _mm_srli_epi32(delta, 24 - bitpos);
            }
        }
    }
}
/* Frame-of-reference pack: subtract `initOffset` from 128 32-bit values
 * (4 interleaved SSE lanes of 32 values each) and pack each difference
 * into a 25-bit field.  Writes exactly 25 __m128i words to `out`.
 * `_in` must be 16-byte aligned (loads use _mm_load_si128).
 * NOTE(review): differences are assumed to fit in 25 bits — overflow
 * bits are silently ORed away, as in the unrolled generated form. */
static void ipackFOR25(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    int bitpos = 0; /* write position (mod 32) inside the current output word */

    for (int k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        if (bitpos == 0) {
            acc = delta;                 /* start a fresh output word */
        } else {
            acc = _mm_or_si128(acc, _mm_slli_epi32(delta, bitpos));
        }
        bitpos += 25;
        if (bitpos >= 32) {              /* current word is full: flush it */
            _mm_store_si128(out++, acc);
            bitpos -= 32;
            if (bitpos > 0) {            /* carry the spilled high bits */
                acc = _mm_srli_epi32(delta, 25 - bitpos);
            }
        }
    }
}
/* Frame-of-reference pack: subtract `initOffset` from 128 32-bit values
 * (4 interleaved SSE lanes of 32 values each) and pack each difference
 * into a 26-bit field.  Writes exactly 26 __m128i words to `out`.
 * `_in` must be 16-byte aligned (loads use _mm_load_si128).
 * NOTE(review): differences are assumed to fit in 26 bits — overflow
 * bits are silently ORed away, as in the unrolled generated form. */
static void ipackFOR26(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    int bitpos = 0; /* write position (mod 32) inside the current output word */

    for (int k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        if (bitpos == 0) {
            acc = delta;                 /* start a fresh output word */
        } else {
            acc = _mm_or_si128(acc, _mm_slli_epi32(delta, bitpos));
        }
        bitpos += 26;
        if (bitpos >= 32) {              /* current word is full: flush it */
            _mm_store_si128(out++, acc);
            bitpos -= 32;
            if (bitpos > 0) {            /* carry the spilled high bits */
                acc = _mm_srli_epi32(delta, 26 - bitpos);
            }
        }
    }
}
_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + +} + + + +static void ipackFOR27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
/*
 * ipackFOR28 -- SSE2 frame-of-reference packing, 28 bits per value.
 *
 * Reads 128 32-bit values from _in (viewed as 32 aligned __m128i, four
 * lanes packed in parallel), subtracts initOffset from every lane, and
 * packs the resulting 28-bit deltas contiguously (little-endian within
 * each 32-bit lane) into 28 aligned __m128i vectors at out.
 *
 * NOTE(review): no masking is performed, so each (value - offset) is
 * assumed to fit in 28 bits; _in and out must be 16-byte aligned.
 * Rewritten as a shift-accumulator loop; the bit layout is identical to
 * the fully unrolled generated original (which repeats a 8-value group
 * four times because 8 * 28 is an exact multiple of 32).
 */
static void ipackFOR28(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    const int bit = 28;
    __m128i acc = _mm_setzero_si128();  /* partially assembled output word */
    int used = 0;                       /* bits of acc already occupied    */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        /* append the 28-bit delta above the bits already in acc */
        acc = _mm_or_si128(acc, _mm_sll_epi32(delta, _mm_cvtsi32_si128(used)));
        used += bit;
        if (used >= 32) {               /* acc full: flush, keep the spill */
            _mm_store_si128(out, acc);
            ++out;
            used -= 32;
            acc = used ? _mm_srl_epi32(delta, _mm_cvtsi32_si128(bit - used))
                       : _mm_setzero_si128();
        }
    }
}
/*
 * ipackFOR29 -- SSE2 frame-of-reference packing, 29 bits per value.
 *
 * Reads 128 32-bit values from _in (viewed as 32 aligned __m128i, four
 * lanes packed in parallel), subtracts initOffset from every lane, and
 * packs the resulting 29-bit deltas contiguously (little-endian within
 * each 32-bit lane) into 29 aligned __m128i vectors at out.
 *
 * NOTE(review): no masking is performed, so each (value - offset) is
 * assumed to fit in 29 bits; _in and out must be 16-byte aligned.
 * Rewritten as a shift-accumulator loop; the bit layout is identical to
 * the fully unrolled generated original.
 */
static void ipackFOR29(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    const int bit = 29;
    __m128i acc = _mm_setzero_si128();  /* partially assembled output word */
    int used = 0;                       /* bits of acc already occupied    */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        /* append the 29-bit delta above the bits already in acc */
        acc = _mm_or_si128(acc, _mm_sll_epi32(delta, _mm_cvtsi32_si128(used)));
        used += bit;
        if (used >= 32) {               /* acc full: flush, keep the spill */
            _mm_store_si128(out, acc);
            ++out;
            used -= 32;
            acc = used ? _mm_srl_epi32(delta, _mm_cvtsi32_si128(bit - used))
                       : _mm_setzero_si128();
        }
    }
}
/*
 * ipackFOR30 -- SSE2 frame-of-reference packing, 30 bits per value.
 *
 * Reads 128 32-bit values from _in (viewed as 32 aligned __m128i, four
 * lanes packed in parallel), subtracts initOffset from every lane, and
 * packs the resulting 30-bit deltas contiguously (little-endian within
 * each 32-bit lane) into 30 aligned __m128i vectors at out.
 *
 * NOTE(review): no masking is performed, so each (value - offset) is
 * assumed to fit in 30 bits; _in and out must be 16-byte aligned.
 * Rewritten as a shift-accumulator loop; the bit layout is identical to
 * the fully unrolled generated original (which repeats a 16-value group
 * twice because 16 * 30 is an exact multiple of 32).
 */
static void ipackFOR30(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    const int bit = 30;
    __m128i acc = _mm_setzero_si128();  /* partially assembled output word */
    int used = 0;                       /* bits of acc already occupied    */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        /* append the 30-bit delta above the bits already in acc */
        acc = _mm_or_si128(acc, _mm_sll_epi32(delta, _mm_cvtsi32_si128(used)));
        used += bit;
        if (used >= 32) {               /* acc full: flush, keep the spill */
            _mm_store_si128(out, acc);
            ++out;
            used -= 32;
            acc = used ? _mm_srl_epi32(delta, _mm_cvtsi32_si128(bit - used))
                       : _mm_setzero_si128();
        }
    }
}
/*
 * ipackFOR31 -- SSE2 frame-of-reference packing, 31 bits per value.
 *
 * Reads 128 32-bit values from _in (viewed as 32 aligned __m128i, four
 * lanes packed in parallel), subtracts initOffset from every lane, and
 * packs the resulting 31-bit deltas contiguously (little-endian within
 * each 32-bit lane) into 31 aligned __m128i vectors at out.
 *
 * NOTE(review): no masking is performed, so each (value - offset) is
 * assumed to fit in 31 bits; _in and out must be 16-byte aligned.
 * Rewritten as a shift-accumulator loop; the bit layout is identical to
 * the fully unrolled generated original.
 */
static void ipackFOR31(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    const int bit = 31;
    __m128i acc = _mm_setzero_si128();  /* partially assembled output word */
    int used = 0;                       /* bits of acc already occupied    */
    int k;

    for (k = 0; k < 32; ++k) {
        const __m128i delta = _mm_sub_epi32(_mm_load_si128(in + k), initOffset);
        /* append the 31-bit delta above the bits already in acc */
        acc = _mm_or_si128(acc, _mm_sll_epi32(delta, _mm_cvtsi32_si128(used)));
        used += bit;
        if (used >= 32) {               /* acc full: flush, keep the spill */
            _mm_store_si128(out, acc);
            ++out;
            used -= 32;
            acc = used ? _mm_srl_epi32(delta, _mm_cvtsi32_si128(bit - used))
                       : _mm_setzero_si128();
        }
    }
}
/* "Pack" 32 SSE vectors (128 x 32-bit values) at a bit width of 32.
 * Every value already occupies a full word, so no frame-of-reference
 * subtraction and no bit shifting is required: this degenerates into a
 * straight 128-value aligned copy.
 * initOffset is accepted (and ignored) only so the signature matches the
 * other ipackFOR* bit widths. */
static void ipackFOR32(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = (const __m128i *)(_in);
    int k;

    (void) initOffset; /* unused: 32-bit packing stores values verbatim */

    for (k = 0; k < 32; ++k) {
        _mm_store_si128(out + k, _mm_load_si128(in + k));
    }
}
/* Unpack 32 SSE vectors (128 values) stored frame-of-reference coded at a
 * bit width of 1, adding initOffset to every decoded lane.
 * Exactly one input vector is read: 32 one-bit fields fill a 32-bit word.
 * Returns initOffset unchanged (kept for interface uniformity with the
 * other iunpackFOR* widths). */
static __m128i iunpackFOR1(__m128i initOffset, const __m128i* in, uint32_t * _out) {
    __m128i *out = (__m128i *)(_out);
    __m128i InReg = _mm_load_si128(in);
    const __m128i mask = _mm_set1_epi32((1U << 1) - 1);
    unsigned shift;

    /* Every output comes from the same input vector at increasing shifts. */
    for (shift = 0; shift < 32; ++shift) {
        __m128i field = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
        _mm_store_si128(out++, _mm_add_epi32(field, initOffset));
    }

    return initOffset;
}
/* Unpack 32 SSE vectors (128 values) stored frame-of-reference coded at a
 * bit width of 2, adding initOffset to every decoded lane.
 * 2 divides 32, so fields never straddle a word boundary: each of the two
 * input vectors yields sixteen 2-bit fields at shifts 0,2,...,30.
 * Returns initOffset unchanged. */
static __m128i iunpackFOR2(__m128i initOffset, const __m128i* in, uint32_t * _out) {
    __m128i *out = (__m128i *)(_out);
    __m128i InReg = _mm_load_si128(in);
    const __m128i mask = _mm_set1_epi32((1U << 2) - 1);
    unsigned word, shift;

    for (word = 0; word < 2; ++word) {
        if (word) {
            InReg = _mm_load_si128(++in); /* advance to the next packed vector */
        }
        for (shift = 0; shift < 32; shift += 2) {
            __m128i field = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            _mm_store_si128(out++, _mm_add_epi32(field, initOffset));
        }
    }

    return initOffset;
}
/* Unpack 32 SSE vectors (128 values) stored frame-of-reference coded at a
 * bit width of 3, adding initOffset to every decoded lane.
 * 3 does not divide 32, so two of the 3-bit fields straddle a word
 * boundary; those are reassembled from the low bits of the current word
 * and the high bits of the next one. Exactly three input vectors are
 * read. Returns initOffset unchanged. */
static __m128i iunpackFOR3(__m128i initOffset, const __m128i* in, uint32_t * _out) {
    __m128i *out = (__m128i *)(_out);
    __m128i InReg = _mm_load_si128(in);
    const __m128i mask = _mm_set1_epi32((1U << 3) - 1);
    unsigned i, shift = 0;

    for (i = 0; i < 32; ++i) {
        __m128i field;
        if (shift + 3 <= 32) {
            /* field lies entirely inside the current word */
            field = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += 3;
            if (shift == 32 && i + 1 < 32) {
                /* word exhausted exactly; fetch the next one */
                shift = 0;
                InReg = _mm_load_si128(++in);
            }
        } else {
            /* field straddles two words: low part from the current word,
             * high part from the next */
            field = _mm_srli_epi32(InReg, shift);
            InReg = _mm_load_si128(++in);
            field = _mm_or_si128(field,
                _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift = 3 - (32 - shift);
        }
        _mm_store_si128(out++, _mm_add_epi32(field, initOffset));
    }

    return initOffset;
}
/* Unpack 32 SSE vectors (128 values) stored frame-of-reference coded at a
 * bit width of 4, adding initOffset to every decoded lane.
 * 4 divides 32, so fields never straddle a word boundary: each of the four
 * input vectors yields eight 4-bit fields at shifts 0,4,...,28.
 * Returns initOffset unchanged. */
static __m128i iunpackFOR4(__m128i initOffset, const __m128i* in, uint32_t * _out) {
    __m128i *out = (__m128i *)(_out);
    __m128i InReg = _mm_load_si128(in);
    const __m128i mask = _mm_set1_epi32((1U << 4) - 1);
    unsigned word, shift;

    for (word = 0; word < 4; ++word) {
        if (word) {
            InReg = _mm_load_si128(++in); /* advance to the next packed vector */
        }
        for (shift = 0; shift < 32; shift += 4) {
            __m128i field = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            _mm_store_si128(out++, _mm_add_epi32(field, initOffset));
        }
    }

    return initOffset;
}
/* Unpack 32 SSE vectors (128 values) stored frame-of-reference coded at a
 * bit width of 5, adding initOffset to every decoded lane.
 * 5 does not divide 32, so four of the 5-bit fields straddle a word
 * boundary; those are reassembled from the low bits of the current word
 * and the high bits of the next one. Exactly five input vectors are read.
 * Returns initOffset unchanged. */
static __m128i iunpackFOR5(__m128i initOffset, const __m128i* in, uint32_t * _out) {
    __m128i *out = (__m128i *)(_out);
    __m128i InReg = _mm_load_si128(in);
    const __m128i mask = _mm_set1_epi32((1U << 5) - 1);
    unsigned i, shift = 0;

    for (i = 0; i < 32; ++i) {
        __m128i field;
        if (shift + 5 <= 32) {
            /* field lies entirely inside the current word */
            field = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += 5;
            if (shift == 32 && i + 1 < 32) {
                /* word exhausted exactly; fetch the next one */
                shift = 0;
                InReg = _mm_load_si128(++in);
            }
        } else {
            /* field straddles two words: low part from the current word,
             * high part from the next */
            field = _mm_srli_epi32(InReg, shift);
            InReg = _mm_load_si128(++in);
            field = _mm_or_si128(field,
                _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift = 5 - (32 - shift);
        }
        _mm_store_si128(out++, _mm_add_epi32(field, initOffset));
    }

    return initOffset;
}
tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR6(__m128i initOffset, const 
__m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR7(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); 
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR8(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = 
_mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR9(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR10(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR11(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; 
InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR12(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + 
OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), 
mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR13(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); 
+ + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR14(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR15(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR16(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR17(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); 
+ + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + 
/* Unpack 128 18-bit FOR (frame-of-reference) values packed in "in" into
 * "_out", adding "initOffset" to every decoded field.  Equivalent to the
 * generated unrolled kernel: consumes 18 SSE registers (72 words), emits
 * 32 stores (128 uint32s), and returns initOffset unchanged so callers
 * can chain the same base across blocks. */
static __m128i iunpackFOR18(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i *out = (__m128i *)(_out);
    const __m128i mask = _mm_set1_epi32((1U << 18) - 1);
    __m128i reg = _mm_load_si128(in);
    unsigned bit = 0;   /* bit offset of the next 18-bit field inside 'reg' */
    int k;

    for (k = 0; k < 32; ++k) {
        __m128i v;
        if (bit + 18 < 32) {
            /* Field lies entirely inside the current 32-bit word. */
            v = _mm_and_si128(_mm_srli_epi32(reg, (int)bit), mask);
            bit += 18;
        } else if (bit + 18 == 32) {
            /* Field ends exactly at the word boundary: the shift already
             * clears the high bits, so no mask is needed. */
            v = _mm_srli_epi32(reg, (int)bit);
            if (k != 31)
                reg = _mm_load_si128(++in);
            bit = 0;
        } else {
            /* Field straddles two words: low part from the current word,
             * high part from the next one, shifted into place. */
            v = _mm_srli_epi32(reg, (int)bit);
            reg = _mm_load_si128(++in);
            v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi32(reg, (int)(32 - bit)), mask));
            bit = bit + 18 - 32;
        }
        _mm_store_si128(out++, _mm_add_epi32(v, initOffset));
    }

    return initOffset;
}
/* Unpack 128 19-bit FOR (frame-of-reference) values packed in "in" into
 * "_out", adding "initOffset" to every decoded field.  Equivalent to the
 * generated unrolled kernel: consumes 19 SSE registers (76 words), emits
 * 32 stores (128 uint32s), and returns initOffset unchanged so callers
 * can chain the same base across blocks. */
static __m128i iunpackFOR19(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i *out = (__m128i *)(_out);
    const __m128i mask = _mm_set1_epi32((1U << 19) - 1);
    __m128i reg = _mm_load_si128(in);
    unsigned bit = 0;   /* bit offset of the next 19-bit field inside 'reg' */
    int k;

    for (k = 0; k < 32; ++k) {
        __m128i v;
        if (bit + 19 < 32) {
            /* Field lies entirely inside the current 32-bit word. */
            v = _mm_and_si128(_mm_srli_epi32(reg, (int)bit), mask);
            bit += 19;
        } else if (bit + 19 == 32) {
            /* Field ends exactly at the word boundary: the shift already
             * clears the high bits, so no mask is needed. */
            v = _mm_srli_epi32(reg, (int)bit);
            if (k != 31)
                reg = _mm_load_si128(++in);
            bit = 0;
        } else {
            /* Field straddles two words: low part from the current word,
             * high part from the next one, shifted into place. */
            v = _mm_srli_epi32(reg, (int)bit);
            reg = _mm_load_si128(++in);
            v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi32(reg, (int)(32 - bit)), mask));
            bit = bit + 19 - 32;
        }
        _mm_store_si128(out++, _mm_add_epi32(v, initOffset));
    }

    return initOffset;
}
/* Unpack 128 20-bit FOR (frame-of-reference) values packed in "in" into
 * "_out", adding "initOffset" to every decoded field.  Equivalent to the
 * generated unrolled kernel: consumes 20 SSE registers (80 words), emits
 * 32 stores (128 uint32s), and returns initOffset unchanged so callers
 * can chain the same base across blocks. */
static __m128i iunpackFOR20(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i *out = (__m128i *)(_out);
    const __m128i mask = _mm_set1_epi32((1U << 20) - 1);
    __m128i reg = _mm_load_si128(in);
    unsigned bit = 0;   /* bit offset of the next 20-bit field inside 'reg' */
    int k;

    for (k = 0; k < 32; ++k) {
        __m128i v;
        if (bit + 20 < 32) {
            /* Field lies entirely inside the current 32-bit word. */
            v = _mm_and_si128(_mm_srli_epi32(reg, (int)bit), mask);
            bit += 20;
        } else if (bit + 20 == 32) {
            /* Field ends exactly at the word boundary: the shift already
             * clears the high bits, so no mask is needed. */
            v = _mm_srli_epi32(reg, (int)bit);
            if (k != 31)
                reg = _mm_load_si128(++in);
            bit = 0;
        } else {
            /* Field straddles two words: low part from the current word,
             * high part from the next one, shifted into place. */
            v = _mm_srli_epi32(reg, (int)bit);
            reg = _mm_load_si128(++in);
            v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi32(reg, (int)(32 - bit)), mask));
            bit = bit + 20 - 32;
        }
        _mm_store_si128(out++, _mm_add_epi32(v, initOffset));
    }

    return initOffset;
}
/* Unpack 128 21-bit FOR (frame-of-reference) values packed in "in" into
 * "_out", adding "initOffset" to every decoded field.  Equivalent to the
 * generated unrolled kernel: consumes 21 SSE registers (84 words), emits
 * 32 stores (128 uint32s), and returns initOffset unchanged so callers
 * can chain the same base across blocks. */
static __m128i iunpackFOR21(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i *out = (__m128i *)(_out);
    const __m128i mask = _mm_set1_epi32((1U << 21) - 1);
    __m128i reg = _mm_load_si128(in);
    unsigned bit = 0;   /* bit offset of the next 21-bit field inside 'reg' */
    int k;

    for (k = 0; k < 32; ++k) {
        __m128i v;
        if (bit + 21 < 32) {
            /* Field lies entirely inside the current 32-bit word. */
            v = _mm_and_si128(_mm_srli_epi32(reg, (int)bit), mask);
            bit += 21;
        } else if (bit + 21 == 32) {
            /* Field ends exactly at the word boundary: the shift already
             * clears the high bits, so no mask is needed. */
            v = _mm_srli_epi32(reg, (int)bit);
            if (k != 31)
                reg = _mm_load_si128(++in);
            bit = 0;
        } else {
            /* Field straddles two words: low part from the current word,
             * high part from the next one, shifted into place. */
            v = _mm_srli_epi32(reg, (int)bit);
            reg = _mm_load_si128(++in);
            v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi32(reg, (int)(32 - bit)), mask));
            bit = bit + 21 - 32;
        }
        _mm_store_si128(out++, _mm_add_epi32(v, initOffset));
    }

    return initOffset;
}
/* Unpack 128 22-bit FOR (frame-of-reference) values packed in "in" into
 * "_out", adding "initOffset" to every decoded field.  Equivalent to the
 * generated unrolled kernel: consumes 22 SSE registers (88 words), emits
 * 32 stores (128 uint32s), and returns initOffset unchanged so callers
 * can chain the same base across blocks. */
static __m128i iunpackFOR22(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i *out = (__m128i *)(_out);
    const __m128i mask = _mm_set1_epi32((1U << 22) - 1);
    __m128i reg = _mm_load_si128(in);
    unsigned bit = 0;   /* bit offset of the next 22-bit field inside 'reg' */
    int k;

    for (k = 0; k < 32; ++k) {
        __m128i v;
        if (bit + 22 < 32) {
            /* Field lies entirely inside the current 32-bit word. */
            v = _mm_and_si128(_mm_srli_epi32(reg, (int)bit), mask);
            bit += 22;
        } else if (bit + 22 == 32) {
            /* Field ends exactly at the word boundary: the shift already
             * clears the high bits, so no mask is needed. */
            v = _mm_srli_epi32(reg, (int)bit);
            if (k != 31)
                reg = _mm_load_si128(++in);
            bit = 0;
        } else {
            /* Field straddles two words: low part from the current word,
             * high part from the next one, shifted into place. */
            v = _mm_srli_epi32(reg, (int)bit);
            reg = _mm_load_si128(++in);
            v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi32(reg, (int)(32 - bit)), mask));
            bit = bit + 22 - 32;
        }
        _mm_store_si128(out++, _mm_add_epi32(v, initOffset));
    }

    return initOffset;
}
_mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR24(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i 
tmp; + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; 
InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR25(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + 
OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,7); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR26(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + 
OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR27(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + + + tmp = InReg; + OutReg 
= _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR28(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR29(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = 
_mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + 
OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR30(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); 
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR31(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
31-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + +static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) { + __m128i * mout = (__m128i *)_out; + __m128i invec; + size_t k; + (void) initvalue; + for(k = 0; k < 128/4; ++k) { + invec = _mm_load_si128(in++); + _mm_store_si128(mout++, invec); + } + return invec; +} + + + + + +void simdpackFOR(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: ipackFOR0(initOffset,in,out); break; + + case 1: ipackFOR1(initOffset,in,out); break; + + case 2: ipackFOR2(initOffset,in,out); break; + + case 3: ipackFOR3(initOffset,in,out); break; + + case 4: ipackFOR4(initOffset,in,out); break; + + case 5: 
ipackFOR5(initOffset,in,out); break; + + case 6: ipackFOR6(initOffset,in,out); break; + + case 7: ipackFOR7(initOffset,in,out); break; + + case 8: ipackFOR8(initOffset,in,out); break; + + case 9: ipackFOR9(initOffset,in,out); break; + + case 10: ipackFOR10(initOffset,in,out); break; + + case 11: ipackFOR11(initOffset,in,out); break; + + case 12: ipackFOR12(initOffset,in,out); break; + + case 13: ipackFOR13(initOffset,in,out); break; + + case 14: ipackFOR14(initOffset,in,out); break; + + case 15: ipackFOR15(initOffset,in,out); break; + + case 16: ipackFOR16(initOffset,in,out); break; + + case 17: ipackFOR17(initOffset,in,out); break; + + case 18: ipackFOR18(initOffset,in,out); break; + + case 19: ipackFOR19(initOffset,in,out); break; + + case 20: ipackFOR20(initOffset,in,out); break; + + case 21: ipackFOR21(initOffset,in,out); break; + + case 22: ipackFOR22(initOffset,in,out); break; + + case 23: ipackFOR23(initOffset,in,out); break; + + case 24: ipackFOR24(initOffset,in,out); break; + + case 25: ipackFOR25(initOffset,in,out); break; + + case 26: ipackFOR26(initOffset,in,out); break; + + case 27: ipackFOR27(initOffset,in,out); break; + + case 28: ipackFOR28(initOffset,in,out); break; + + case 29: ipackFOR29(initOffset,in,out); break; + + case 30: ipackFOR30(initOffset,in,out); break; + + case 31: ipackFOR31(initOffset,in,out); break; + + case 32: ipackFOR32(initOffset,in,out); break; + + default: break; + } +} + + + + +void simdunpackFOR(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: iunpackFOR0(initOffset, in,out); break; + + case 1: iunpackFOR1(initOffset, in,out); break; + + case 2: iunpackFOR2(initOffset, in,out); break; + + case 3: iunpackFOR3(initOffset, in,out); break; + + case 4: iunpackFOR4(initOffset, in,out); break; + + case 5: iunpackFOR5(initOffset, in,out); break; + + case 6: iunpackFOR6(initOffset, in,out); break; + + case 7: 
iunpackFOR7(initOffset, in,out); break; + + case 8: iunpackFOR8(initOffset, in,out); break; + + case 9: iunpackFOR9(initOffset, in,out); break; + + case 10: iunpackFOR10(initOffset, in,out); break; + + case 11: iunpackFOR11(initOffset, in,out); break; + + case 12: iunpackFOR12(initOffset, in,out); break; + + case 13: iunpackFOR13(initOffset, in,out); break; + + case 14: iunpackFOR14(initOffset, in,out); break; + + case 15: iunpackFOR15(initOffset, in,out); break; + + case 16: iunpackFOR16(initOffset, in,out); break; + + case 17: iunpackFOR17(initOffset, in,out); break; + + case 18: iunpackFOR18(initOffset, in,out); break; + + case 19: iunpackFOR19(initOffset, in,out); break; + + case 20: iunpackFOR20(initOffset, in,out); break; + + case 21: iunpackFOR21(initOffset, in,out); break; + + case 22: iunpackFOR22(initOffset, in,out); break; + + case 23: iunpackFOR23(initOffset, in,out); break; + + case 24: iunpackFOR24(initOffset, in,out); break; + + case 25: iunpackFOR25(initOffset, in,out); break; + + case 26: iunpackFOR26(initOffset, in,out); break; + + case 27: iunpackFOR27(initOffset, in,out); break; + + case 28: iunpackFOR28(initOffset, in,out); break; + + case 29: iunpackFOR29(initOffset, in,out); break; + + case 30: iunpackFOR30(initOffset, in,out); break; + + case 31: iunpackFOR31(initOffset, in,out); break; + + case 32: iunpackFOR32(initOffset, in,out); break; + + default: break; + } +} + + +uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int slot) { + const uint32_t * pin = (const uint32_t *) in; + if( bit == 0) { + return initvalue; + } else if (bit == 32) { + /* silly special case */ + return pin[slot]; + } else { + const int lane = slot % 4; /* we have 4 interleaved lanes */ + const int bitsinlane = (slot / 4) * bit; /* how many bits in lane */ + const int firstwordinlane = bitsinlane / 32; + const int secondwordinlane = (bitsinlane + bit - 1) / 32; + const uint32_t firstpart = pin[4 * firstwordinlane + lane] + >> (bitsinlane % 
32); + const uint32_t mask = (1 << bit) - 1; + if (firstwordinlane == secondwordinlane) { + /* easy common case*/ + return initvalue + (firstpart & mask); + } else { + /* harder case where we need to combine two words */ + const uint32_t secondpart = pin[4 * firstwordinlane + 4 + lane]; + const int usablebitsinfirstword = 32 - (bitsinlane % 32); + return initvalue + + ((firstpart | (secondpart << usablebitsinfirstword)) + & mask); + } + } + +} + + + + +int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, uint32_t *presult) { + int count = length; + int begin = 0; + uint32_t val; + while (count > 0) { + int step = count / 2; + val = simdselectFOR(initvalue, in, bit, begin + step); + if (val < key) { + begin += step + 1; + count -= step + 1; + } else count = step; + } + *presult = simdselectFOR(initvalue, in, bit, begin); + return begin; +} + +int simdpackFOR_compressedbytes(int length, const uint32_t bit) { + if(bit == 0) return 0;/* nothing to do */ + if(bit == 32) { + return length * sizeof(uint32_t); + } + return (((length + 3 )/ 4) * bit + 31 ) / 32 * sizeof(__m128i); +} + +__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t * in, int length, __m128i * out, const uint32_t bit) { + int k; + int inwordpointer; + __m128i P; + uint32_t firstpass; + __m128i offset; + if(bit == 0) return out;/* nothing to do */ + if(bit == 32) { + memcpy(out,in,length*sizeof(uint32_t)); + return (__m128i *)((uint32_t *) out + length); + } + offset = _mm_set1_epi32(initvalue); + inwordpointer = 0; + P = _mm_setzero_si128(); + for(k = 0; k < length / 4 ; ++k) { + __m128i value = _mm_sub_epi32(_mm_loadu_si128(((const __m128i * ) in + k)),offset); + P = _mm_or_si128(P,_mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if(bit +#include "simdintegratedbitpacking.h" + + +SIMDCOMP_ALIGNED(16) static int8_t shuffle_mask_bytes[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15, + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + }; +static const __m128i *shuffle_mask = (__m128i *) shuffle_mask_bytes; + +/* should emulate std:lower_bound*/ +static int lower_bound(uint32_t * A, uint32_t key, int imin, int imax) +{ + int imid; + imax --; + while(imin + 1 < imax) { + imid = imin + ((imax - imin) / 2); + + if (A[imid] >= key) { + imax = imid; + } else if (A[imid] < key) { + imin = imid; + } + } + if(A[imin] >= key) return imin; + return imax; +} + + +#define PrefixSum(ret, curr, prev) do { \ + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ + ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ + } while (0) + + + +/* perform a lower-bound search for |key| in |out|; the resulting uint32 +* is stored in |*presult|.*/ +#define CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult) \ + do { \ + __m128i tmpout = _mm_sub_epi32(out, conversion); \ + uint32_t mmask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ + if (mmask != 15) { \ + const __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mmask ^ 15]); \ + 
int offset; \ + int remaining = length - i; \ + SIMDCOMP_CTZ(offset, mmask ^ 15); \ + *presult = _mm_cvtsi128_si32(p); \ + if (offset < remaining) \ + return (i + offset); \ + } \ + i += 4; \ + if (i >= length) { /* reached end of array? */ \ + *presult = key + 1; \ + return (length); \ + } \ + } while (0) + +static int +iunpacksearchwithlength0(__m128i initOffset , const __m128i * _in, int length, + uint32_t key, uint32_t *presult) +{ + if (length > 0) { + uint32_t repeatedvalue = (uint32_t) _mm_extract_epi32(initOffset, 3); + if (repeatedvalue >= key) { + *presult = repeatedvalue; + return 0; + } + } + (void) _in; + *presult = key + 1; + return (length); +} + +static int +iunpacksearchwithlength1(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<1)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); 
+ out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength2(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<2)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + 
out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength3(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<3)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = 
out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength4(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<4)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out 
= _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength5(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<5)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out 
= _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength6(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<6)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, 
mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, 
key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength7(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<7)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp 
= _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength8(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<8)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength9(__m128i initOffset, const __m128i *in, int length, + 
uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<9)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
9-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength10(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<10)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out 
= _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength11(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<11)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength12(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + 
__m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<12)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out 
= tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength13(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<13)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + 
out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out 
= tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength14(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<14)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), 
mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, 
length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength15(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<15)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + 
return (128); +} + +static int +iunpacksearchwithlength16(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<16)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength17(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<17)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); 
+ + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, 
length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength18(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<18)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, 
mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, 
out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength19(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<19)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength20(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<20)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + 
tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength21(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<21)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength22(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<22)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = 
out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; 
+ CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength23(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<23)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength24(__m128i initOffset, const __m128i *in, int 
length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<24)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp 
= InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength25(__m128i initOffset, const __m128i *in, int length, + uint32_t key, 
uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<25)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + 
CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength26(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<26)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength27(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<27)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); 
+ + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength28(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<28)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength29(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<29)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + PrefixSum(out, out, 
initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength30(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<30)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp 
= _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, initOffset); + 
initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, initOffset); 
+ initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearchwithlength31(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<31)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp 
= _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + 
tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + 
+ tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = 
_mm_srli_epi32(InReg,3); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = tmp; + PrefixSum(out, out, initOffset); + initOffset = out; + CHECK_AND_INCREMENT_WITH_LENGTH(i, out, length, key, presult); + + *presult = key + 1; + return (128); +} + + +static int +iunpacksearchwithlength32(__m128i initOffset, const __m128i *in, int length, + uint32_t key, uint32_t *presult) +{ + uint32_t * in32 = (uint32_t *)in; + int answer = lower_bound(in32, key, 0, length); + if(in32[answer] < key) { + *presult = key + 1; + return (length); + } + (void) initOffset; + *presult = in32[answer]; + return answer; +} + + +int +simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit, int length, + uint32_t key, uint32_t *presult) +{ + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch (bit) { + case 0: return iunpacksearchwithlength0(initOffset, in, length, key, presult); + + case 1: return iunpacksearchwithlength1(initOffset, in, length, key, presult); + + case 2: return iunpacksearchwithlength2(initOffset, in, length, key, presult); + + case 3: return iunpacksearchwithlength3(initOffset, in, length, key, presult); + + case 4: return iunpacksearchwithlength4(initOffset, in, length, key, presult); + + case 5: return iunpacksearchwithlength5(initOffset, in, length, key, presult); + + case 6: return iunpacksearchwithlength6(initOffset, in, length, key, presult); + + case 7: return iunpacksearchwithlength7(initOffset, in, length, key, presult); + + 
case 8: return iunpacksearchwithlength8(initOffset, in, length, key, presult); + + case 9: return iunpacksearchwithlength9(initOffset, in, length, key, presult); + + case 10: return iunpacksearchwithlength10(initOffset, in, length, key, presult); + + case 11: return iunpacksearchwithlength11(initOffset, in, length, key, presult); + + case 12: return iunpacksearchwithlength12(initOffset, in, length, key, presult); + + case 13: return iunpacksearchwithlength13(initOffset, in, length, key, presult); + + case 14: return iunpacksearchwithlength14(initOffset, in, length, key, presult); + + case 15: return iunpacksearchwithlength15(initOffset, in, length, key, presult); + + case 16: return iunpacksearchwithlength16(initOffset, in, length, key, presult); + + case 17: return iunpacksearchwithlength17(initOffset, in, length, key, presult); + + case 18: return iunpacksearchwithlength18(initOffset, in, length, key, presult); + + case 19: return iunpacksearchwithlength19(initOffset, in, length, key, presult); + + case 20: return iunpacksearchwithlength20(initOffset, in, length, key, presult); + + case 21: return iunpacksearchwithlength21(initOffset, in, length, key, presult); + + case 22: return iunpacksearchwithlength22(initOffset, in, length, key, presult); + + case 23: return iunpacksearchwithlength23(initOffset, in, length, key, presult); + + case 24: return iunpacksearchwithlength24(initOffset, in, length, key, presult); + + case 25: return iunpacksearchwithlength25(initOffset, in, length, key, presult); + + case 26: return iunpacksearchwithlength26(initOffset, in, length, key, presult); + + case 27: return iunpacksearchwithlength27(initOffset, in, length, key, presult); + + case 28: return iunpacksearchwithlength28(initOffset, in, length, key, presult); + + case 29: return iunpacksearchwithlength29(initOffset, in, length, key, presult); + + case 30: return iunpacksearchwithlength30(initOffset, in, length, key, presult); + + case 31: return 
iunpacksearchwithlength31(initOffset, in, length, key, presult); + + case 32: return iunpacksearchwithlength32(initOffset, in, length, key, presult); + + default: break; + } + return (-1); +} + + + + + +/* perform a lower-bound search for |key| in |out|; the resulting uint32 +* is stored in |*presult|.*/ +#define CHECK_AND_INCREMENT(i, out, key, presult) \ + do { \ + __m128i tmpout = _mm_sub_epi32(out, conversion); \ + uint32_t mmask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmpout, key4))); \ + if (mmask != 15) { \ + __m128i p = _mm_shuffle_epi8(out, shuffle_mask[mmask ^ 15]); \ + int offset; \ + SIMDCOMP_CTZ(offset, mmask ^ 15); \ + *presult = _mm_cvtsi128_si32(p); \ + return (i + offset); \ + } \ + i += 4; \ + } while (0) + +static int +iunpacksearch0(__m128i * initOffset , const __m128i * _in, + uint32_t key, uint32_t *presult) +{ + uint32_t repeatedvalue = (uint32_t) _mm_extract_epi32(*initOffset, 3); + if (repeatedvalue >= key) { + *presult = repeatedvalue; + return 0; + } + *presult = key + 1; + (void)_in; + return (128); +} + +static int +iunpacksearch1(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<1)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, 
out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch2(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<2)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
/*
 * Search a block of 128 values packed at 3 bits each (3 SSE words of input).
 *
 * Deltas are unpacked 4 at a time, turned into running values via PrefixSum
 * (seeded from and written back to *initOffset), and tested against `key` by
 * the CHECK_AND_INCREMENT macro, which is expected to return early with the
 * position of the first match (macro defined earlier in this file — NOTE:
 * exact early-return contract not visible in this chunk).
 * Returns 128 (and sets *presult = key + 1) when no element matches.
 */
static int
iunpacksearch3(__m128i * initOffset, const __m128i *in,
                uint32_t key, uint32_t *presult)
{
    const uint32_t bit = 3;
    int i = 0;
    int step;
    uint32_t shift = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i mask = _mm_set1_epi32((1U<<3)-1);
    /* conversion and key4 are consumed inside CHECK_AND_INCREMENT. */
    __m128i conversion = _mm_set1_epi32(2147483648U);
    __m128i key4 = _mm_set1_epi32(key - 2147483648U);

    /* 32 extraction steps x 4 lanes = 128 values. */
    for (step = 0; step < 32; ++step) {
        if (shift + bit < 32) {
            /* Field lies entirely inside the current word. */
            out = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += bit;
        } else if (shift + bit == 32) {
            /* Field ends exactly at the word boundary; high bits are
               already zero after the shift, so no mask is needed. */
            out = _mm_srli_epi32(InReg, shift);
            shift = 0;
            if (step < 31) {
                ++in;
                InReg = _mm_loadu_si128(in);
            }
        } else {
            /* Field straddles two words: take the low part, load the
               next word, then merge in the remaining high bits. */
            out = _mm_srli_epi32(InReg, shift);
            ++in;
            InReg = _mm_loadu_si128(in);
            out = _mm_or_si128(out,
                _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift = shift + bit - 32;
        }
        PrefixSum(out, out, *initOffset);
        *initOffset = out;
        CHECK_AND_INCREMENT(i, out, key, presult);
    }

    *presult = key + 1;
    return (128);
}
/*
 * Search a block of 128 values packed at 4 bits each (4 SSE words of input).
 *
 * Because 4 divides 32, every field sits wholly inside one word and each
 * input word yields exactly 8 fields. Deltas are prefix-summed through
 * *initOffset and checked against `key` by CHECK_AND_INCREMENT (macro
 * defined earlier in this file — NOTE: exact early-return contract not
 * visible in this chunk).
 * Returns 128 (and sets *presult = key + 1) when no element matches.
 */
static int
iunpacksearch4(__m128i * initOffset, const __m128i *in,
                uint32_t key, uint32_t *presult)
{
    const uint32_t bit = 4;
    int i = 0;
    int step;
    uint32_t shift = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i mask = _mm_set1_epi32((1U<<4)-1);
    /* conversion and key4 are consumed inside CHECK_AND_INCREMENT. */
    __m128i conversion = _mm_set1_epi32(2147483648U);
    __m128i key4 = _mm_set1_epi32(key - 2147483648U);

    /* 32 extraction steps x 4 lanes = 128 values. */
    for (step = 0; step < 32; ++step) {
        if (shift + bit < 32) {
            out = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += bit;
        } else {
            /* shift + bit == 32 always holds here for a 4-bit width:
               the field ends at the word boundary, high bits are clear. */
            out = _mm_srli_epi32(InReg, shift);
            shift = 0;
            if (step < 31) {
                ++in;
                InReg = _mm_loadu_si128(in);
            }
        }
        PrefixSum(out, out, *initOffset);
        *initOffset = out;
        CHECK_AND_INCREMENT(i, out, key, presult);
    }

    *presult = key + 1;
    return (128);
}
/*
 * Search a block of 128 values packed at 5 bits each (5 SSE words of input).
 *
 * Deltas are unpacked 4 at a time (fields may straddle word boundaries),
 * turned into running values via PrefixSum (seeded from and written back to
 * *initOffset), and tested against `key` by CHECK_AND_INCREMENT (macro
 * defined earlier in this file — NOTE: exact early-return contract not
 * visible in this chunk).
 * Returns 128 (and sets *presult = key + 1) when no element matches.
 */
static int
iunpacksearch5(__m128i * initOffset, const __m128i *in,
                uint32_t key, uint32_t *presult)
{
    const uint32_t bit = 5;
    int i = 0;
    int step;
    uint32_t shift = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i mask = _mm_set1_epi32((1U<<5)-1);
    /* conversion and key4 are consumed inside CHECK_AND_INCREMENT. */
    __m128i conversion = _mm_set1_epi32(2147483648U);
    __m128i key4 = _mm_set1_epi32(key - 2147483648U);

    /* 32 extraction steps x 4 lanes = 128 values. */
    for (step = 0; step < 32; ++step) {
        if (shift + bit < 32) {
            /* Field lies entirely inside the current word. */
            out = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += bit;
        } else if (shift + bit == 32) {
            /* Field ends exactly at the word boundary; high bits are
               already zero after the shift, so no mask is needed. */
            out = _mm_srli_epi32(InReg, shift);
            shift = 0;
            if (step < 31) {
                ++in;
                InReg = _mm_loadu_si128(in);
            }
        } else {
            /* Field straddles two words: take the low part, load the
               next word, then merge in the remaining high bits. */
            out = _mm_srli_epi32(InReg, shift);
            ++in;
            InReg = _mm_loadu_si128(in);
            out = _mm_or_si128(out,
                _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift = shift + bit - 32;
        }
        PrefixSum(out, out, *initOffset);
        *initOffset = out;
        CHECK_AND_INCREMENT(i, out, key, presult);
    }

    *presult = key + 1;
    return (128);
}
/*
 * Search a block of 128 values packed at 6 bits each (6 SSE words of input).
 *
 * Deltas are unpacked 4 at a time (fields may straddle word boundaries),
 * turned into running values via PrefixSum (seeded from and written back to
 * *initOffset), and tested against `key` by CHECK_AND_INCREMENT (macro
 * defined earlier in this file — NOTE: exact early-return contract not
 * visible in this chunk).
 * Returns 128 (and sets *presult = key + 1) when no element matches.
 */
static int
iunpacksearch6(__m128i * initOffset, const __m128i *in,
                uint32_t key, uint32_t *presult)
{
    const uint32_t bit = 6;
    int i = 0;
    int step;
    uint32_t shift = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i mask = _mm_set1_epi32((1U<<6)-1);
    /* conversion and key4 are consumed inside CHECK_AND_INCREMENT. */
    __m128i conversion = _mm_set1_epi32(2147483648U);
    __m128i key4 = _mm_set1_epi32(key - 2147483648U);

    /* 32 extraction steps x 4 lanes = 128 values. */
    for (step = 0; step < 32; ++step) {
        if (shift + bit < 32) {
            /* Field lies entirely inside the current word. */
            out = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += bit;
        } else if (shift + bit == 32) {
            /* Field ends exactly at the word boundary; high bits are
               already zero after the shift, so no mask is needed. */
            out = _mm_srli_epi32(InReg, shift);
            shift = 0;
            if (step < 31) {
                ++in;
                InReg = _mm_loadu_si128(in);
            }
        } else {
            /* Field straddles two words: take the low part, load the
               next word, then merge in the remaining high bits. */
            out = _mm_srli_epi32(InReg, shift);
            ++in;
            InReg = _mm_loadu_si128(in);
            out = _mm_or_si128(out,
                _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift = shift + bit - 32;
        }
        PrefixSum(out, out, *initOffset);
        *initOffset = out;
        CHECK_AND_INCREMENT(i, out, key, presult);
    }

    *presult = key + 1;
    return (128);
}
/*
 * Search a block of 128 values packed at 7 bits each (7 SSE words of input).
 *
 * Deltas are unpacked 4 at a time (fields may straddle word boundaries),
 * turned into running values via PrefixSum (seeded from and written back to
 * *initOffset), and tested against `key` by CHECK_AND_INCREMENT (macro
 * defined earlier in this file — NOTE: exact early-return contract not
 * visible in this chunk).
 * Returns 128 (and sets *presult = key + 1) when no element matches.
 */
static int
iunpacksearch7(__m128i * initOffset, const __m128i *in,
                uint32_t key, uint32_t *presult)
{
    const uint32_t bit = 7;
    int i = 0;
    int step;
    uint32_t shift = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i mask = _mm_set1_epi32((1U<<7)-1);
    /* conversion and key4 are consumed inside CHECK_AND_INCREMENT. */
    __m128i conversion = _mm_set1_epi32(2147483648U);
    __m128i key4 = _mm_set1_epi32(key - 2147483648U);

    /* 32 extraction steps x 4 lanes = 128 values. */
    for (step = 0; step < 32; ++step) {
        if (shift + bit < 32) {
            /* Field lies entirely inside the current word. */
            out = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += bit;
        } else if (shift + bit == 32) {
            /* Field ends exactly at the word boundary; high bits are
               already zero after the shift, so no mask is needed. */
            out = _mm_srli_epi32(InReg, shift);
            shift = 0;
            if (step < 31) {
                ++in;
                InReg = _mm_loadu_si128(in);
            }
        } else {
            /* Field straddles two words: take the low part, load the
               next word, then merge in the remaining high bits. */
            out = _mm_srli_epi32(InReg, shift);
            ++in;
            InReg = _mm_loadu_si128(in);
            out = _mm_or_si128(out,
                _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift = shift + bit - 32;
        }
        PrefixSum(out, out, *initOffset);
        *initOffset = out;
        CHECK_AND_INCREMENT(i, out, key, presult);
    }

    *presult = key + 1;
    return (128);
}
/*
 * Search a block of 128 values packed at 8 bits each (8 SSE words of input).
 *
 * Because 8 divides 32, every field sits wholly inside one word and each
 * input word yields exactly 4 fields. Deltas are prefix-summed through
 * *initOffset and checked against `key` by CHECK_AND_INCREMENT (macro
 * defined earlier in this file — NOTE: exact early-return contract not
 * visible in this chunk).
 * Returns 128 (and sets *presult = key + 1) when no element matches.
 */
static int
iunpacksearch8(__m128i * initOffset, const __m128i *in,
                uint32_t key, uint32_t *presult)
{
    const uint32_t bit = 8;
    int i = 0;
    int step;
    uint32_t shift = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i mask = _mm_set1_epi32((1U<<8)-1);
    /* conversion and key4 are consumed inside CHECK_AND_INCREMENT. */
    __m128i conversion = _mm_set1_epi32(2147483648U);
    __m128i key4 = _mm_set1_epi32(key - 2147483648U);

    /* 32 extraction steps x 4 lanes = 128 values. */
    for (step = 0; step < 32; ++step) {
        if (shift + bit < 32) {
            out = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += bit;
        } else {
            /* shift + bit == 32 always holds here for an 8-bit width:
               the field ends at the word boundary, high bits are clear. */
            out = _mm_srli_epi32(InReg, shift);
            shift = 0;
            if (step < 31) {
                ++in;
                InReg = _mm_loadu_si128(in);
            }
        }
        PrefixSum(out, out, *initOffset);
        *initOffset = out;
        CHECK_AND_INCREMENT(i, out, key, presult);
    }

    *presult = key + 1;
    return (128);
}
_mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch10(__m128i * initOffset, const __m128i *in, + uint32_t 
key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<10)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, 
presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch11(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<11)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); 
+ PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, 
mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch12(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<12)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int 
+iunpacksearch13(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<13)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch14(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<14)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = 
key + 1; + return (128); +} + +static int +iunpacksearch15(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<15)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, 
key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch16(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<16)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch17(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<17)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, 
presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + PrefixSum(out, out, *initOffset); 
+ *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + 
tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch18(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<18)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch19(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i 
InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<19)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + 
+ PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch20(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<20)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg 
= _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch21(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<21)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, 
out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, 
presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch22(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<22)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); 
+ out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch23(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<23)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), 
mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch24(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<24)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, 
presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch25(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<25)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + 
__m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = 
_mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg 
= _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch26(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = 
_mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<26)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset 
= out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 
1; + return (128); +} + +static int +iunpacksearch27(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<27)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch28(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<28)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); 
+ out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch29(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<29)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch30(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<30)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = _mm_set1_epi32(key - 2147483648U); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, 
presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch31(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<31)-1); + __m128i conversion = _mm_set1_epi32(2147483648U); + __m128i key4 = 
_mm_set1_epi32(key - 2147483648U); + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + PrefixSum(out, out, *initOffset); 
+ *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,3); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,2); + 
out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + tmp = _mm_srli_epi32(InReg,1); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, key, presult); + + *presult = key + 1; + return (128); +} + +static int +iunpacksearch32(__m128i * initOffset, const __m128i *in, + uint32_t key, uint32_t *presult) +{ + uint32_t * in32 = (uint32_t *)in; + int answer = lower_bound(in32, key, 0, 128); + if(in32[answer] < key) { + *presult = key + 1; + return (128); + } + *presult = in32[answer]; + *initOffset = _mm_load_si128(in + 31); + return answer; + +} + + +int +simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit, + uint32_t key, uint32_t *presult) +{ + switch (bit) { + case 0: return iunpacksearch0(initOffset, in, key, presult); + + case 1: return iunpacksearch1(initOffset, in, key, presult); + + case 2: return iunpacksearch2(initOffset, in, key, presult); + + case 3: return iunpacksearch3(initOffset, in, key, presult); + + case 4: return iunpacksearch4(initOffset, in, key, presult); + + case 5: return iunpacksearch5(initOffset, in, key, presult); + + case 6: return iunpacksearch6(initOffset, in, key, presult); + + case 7: return iunpacksearch7(initOffset, in, key, presult); + + case 8: return iunpacksearch8(initOffset, in, key, presult); + + case 9: return iunpacksearch9(initOffset, in, key, presult); + + case 10: return iunpacksearch10(initOffset, in, key, presult); + + case 11: return iunpacksearch11(initOffset, in, key, presult); + + case 12: return iunpacksearch12(initOffset, in, key, presult); + + case 13: return iunpacksearch13(initOffset, in, key, presult); + + case 14: return iunpacksearch14(initOffset, in, key, presult); + + case 15: return iunpacksearch15(initOffset, in, key, presult); + + case 16: return 
iunpacksearch16(initOffset, in, key, presult); + + case 17: return iunpacksearch17(initOffset, in, key, presult); + + case 18: return iunpacksearch18(initOffset, in, key, presult); + + case 19: return iunpacksearch19(initOffset, in, key, presult); + + case 20: return iunpacksearch20(initOffset, in, key, presult); + + case 21: return iunpacksearch21(initOffset, in, key, presult); + + case 22: return iunpacksearch22(initOffset, in, key, presult); + + case 23: return iunpacksearch23(initOffset, in, key, presult); + + case 24: return iunpacksearch24(initOffset, in, key, presult); + + case 25: return iunpacksearch25(initOffset, in, key, presult); + + case 26: return iunpacksearch26(initOffset, in, key, presult); + + case 27: return iunpacksearch27(initOffset, in, key, presult); + + case 28: return iunpacksearch28(initOffset, in, key, presult); + + case 29: return iunpacksearch29(initOffset, in, key, presult); + + case 30: return iunpacksearch30(initOffset, in, key, presult); + + case 31: return iunpacksearch31(initOffset, in, key, presult); + + case 32: return iunpacksearch32(initOffset, in, key, presult); + + default: break; + } + return (-1); +} + +#endif diff --git a/src/simdpackedselect.c b/src/simdpackedselect.c new file mode 100644 index 000000000..09898f1ff --- /dev/null +++ b/src/simdpackedselect.c @@ -0,0 +1,15490 @@ +/** + * This code is released under a BSD License. 
+ */ +#ifdef __SSE4_1__ +#include "simdintegratedbitpacking.h" +#include + + +SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes[256] = { + 0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0, + 4,5,6,7,0,0,0,0,0,0,0,0,0,0,0,0, + 8,9,10,11,0,0,0,0,0,0,0,0,0,0,0,0, + 12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0, + }; + +static const __m128i *shuffle_mask = (__m128i *) shuffle_mask_bytes; + +uint32_t branchlessextract (__m128i out, int i) { + return _mm_cvtsi128_si32(_mm_shuffle_epi8(out,shuffle_mask[i])); +} + +#define PrefixSum(ret, curr, prev) do { \ + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); \ + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); \ + ret = _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); \ + } while (0) + +#define CHECK_AND_INCREMENT(i, out, slot) \ + i += 4; \ + if (i > slot) { \ + return branchlessextract (out, slot - (i - 4)); \ + } + + +static uint32_t +iunpackselect1(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<1)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; 
+ CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, 
slot); + + tmp = _mm_srli_epi32(InReg,30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect2(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<2)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, 
slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect3(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = 
_mm_set1_epi32((1U<<3)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect4(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<4)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect5(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<5)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect6(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<6)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect7(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<7)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<8)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; 
+ ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, 
out, slot); + + + return (0); +} + +static uint32_t +iunpackselect9(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<9)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect10(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<10)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect11(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<11)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out 
= tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect12(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<12)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = 
tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect13(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<13)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect14(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<14)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, 
slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect15(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<15)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, 
slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + 
out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<16)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, 
slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset);
    *initOffset = out;
    CHECK_AND_INCREMENT(i, out, slot);

    /* (tail of iunpackselect16: last three 16-bit lanes of the block) */
    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    return (0);
}

/*
 * iunpackselect17 -- extract the value at index `slot` from a block of 128
 * integers that were bit-packed at 17 bits per value and are integrated
 * (prefix-summed) on the fly.
 *
 * initOffset  in/out: the previously decoded vector, used as the seed for
 *             PrefixSum and updated after every group of four lanes.
 * in          pointer to the 17 packed 128-bit words of the block.
 * slot        index (0..127) of the value to select.
 *
 * The 32 unrolled sections below each rebuild one __m128i of four packed
 * values: shift-right to align, mask to 17 bits, and OR in the spill-over
 * bits from the next word whenever a value straddles a 32-bit boundary
 * (e.g. srli 17 combined with slli 17-2 glues bits 17..31 of one word to
 * bits 0..1 of the next).
 *
 * NOTE(review): PrefixSum and CHECK_AND_INCREMENT are macros defined earlier
 * in this file, outside this excerpt.  CHECK_AND_INCREMENT appears to return
 * the selected scalar once the lane counter `i` has advanced past `slot` --
 * confirm against the macro definition.  The trailing `return (0)` is the
 * fall-through for a `slot` that was never reached.
 */
static uint32_t
iunpackselect17(__m128i * initOffset, const __m128i *in, int slot)
{
    int i = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<17)-1);

    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,17); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,19); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,21); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,23); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,25); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,27); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,29); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,31); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,18); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,22); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,26); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,30); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    /* final lane: top 17 bits of the last word, no further word to load */
    tmp = _mm_srli_epi32(InReg,15); out = tmp;
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    return (0);
}

/*
 * iunpackselect18 -- same contract as iunpackselect17, for 18-bit packing.
 * Because 18 divides evenly into 2*32*... the layout repeats: the 16-lane
 * extraction pattern below occurs twice (the srli 14 step in the middle
 * lands exactly on a word boundary, so it reloads without an OR).
 * NOTE(review): PrefixSum / CHECK_AND_INCREMENT are macros defined earlier
 * in this file, outside this excerpt -- see their definitions for the
 * return path; `return (0)` is the out-of-range fall-through.
 */
static uint32_t
iunpackselect18(__m128i * initOffset, const __m128i *in, int slot)
{
    int i = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<18)-1);

    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,18); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,22); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,26); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,30); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    /* 14+18 == 32: value ends exactly on the word boundary, plain reload */
    tmp = _mm_srli_epi32(InReg,14); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    /* second half: identical 16-lane pattern */
    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,18); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,22); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,26); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,30); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,14); out = tmp;
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    return (0);
}

/*
 * iunpackselect19 -- same contract as iunpackselect17, for 19-bit packing.
 * gcd(19,32) == 1, so no lane ends on a word boundary: every other step
 * stitches two adjacent 32-bit words together.
 * NOTE(review): PrefixSum / CHECK_AND_INCREMENT are macros defined earlier
 * in this file, outside this excerpt; `return (0)` is the out-of-range
 * fall-through.
 */
static uint32_t
iunpackselect19(__m128i * initOffset, const __m128i *in, int slot)
{
    int i = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<19)-1);

    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,19); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,25); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,31); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,18); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,30); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,17); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,23); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,29); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,22); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,15); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,21); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,27); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,14); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,26); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,13); out = tmp;
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    return (0);
}

/*
 * iunpackselect20 -- same contract as iunpackselect17, for 20-bit packing.
 * 8 values fill exactly 5 words, so the 8-lane pattern below repeats four
 * times (each srli 12 step lands on a word boundary and reloads plainly).
 * NOTE(review): PrefixSum / CHECK_AND_INCREMENT are macros defined earlier
 * in this file, outside this excerpt; `return (0)` is the out-of-range
 * fall-through.
 */
static uint32_t
iunpackselect20(__m128i * initOffset, const __m128i *in, int slot)
{
    int i = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<20)-1);

    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    /* repeat 2 of 4 */
    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    /* repeat 3 of 4 */
    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    /* repeat 4 of 4 (last lane does not reload) */
    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,28); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,16); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,24); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,12); out = tmp;
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    return (0);
}

/*
 * iunpackselect21 -- same contract as iunpackselect17, for 21-bit packing.
 * (Continues past this excerpt.)
 */
static uint32_t
iunpackselect21(__m128i * initOffset, const __m128i *in, int slot)
{
    int i = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<21)-1);

    tmp = InReg; out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,21); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,31); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask));
    PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,20); out = tmp;
    ++in; InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset =
out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect22(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<22)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect23(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<23)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; 
+ CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<24)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + 
return (0); +} + +static uint32_t +iunpackselect25(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<25)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), 
mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect26(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<26)-1); + + tmp = 
InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect27(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<27)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect28(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<28)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg 
= _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect29(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<29)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); 
+ out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect30(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<30)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset 
= out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t +iunpackselect31(__m128i * initOffset, const __m128i *in, int slot) +{ + int i = 0; + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<31)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = 
_mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,3); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + tmp = _mm_srli_epi32(InReg,1); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + CHECK_AND_INCREMENT(i, out, slot); + + + return (0); +} + +static uint32_t 
+iunpackselect32(__m128i * initOffset , const __m128i *in, int slot) +{ + uint32_t *begin = (uint32_t *)in; + *initOffset = _mm_load_si128(in + 31); + return begin[slot]; +} + + +uint32_t +simdselectd1(uint32_t init, const __m128i *in, uint32_t bit, int slot) +{ + __m128i vecinitOffset = _mm_set1_epi32(init); + __m128i * initOffset = &vecinitOffset; + slot &= 127; /* to avoid problems */ + + switch (bit) { + case 0: return _mm_extract_epi32(*initOffset,3); break; + + case 1: return iunpackselect1(initOffset, in, slot); break; + + case 2: return iunpackselect2(initOffset, in, slot); break; + + case 3: return iunpackselect3(initOffset, in, slot); break; + + case 4: return iunpackselect4(initOffset, in, slot); break; + + case 5: return iunpackselect5(initOffset, in, slot); break; + + case 6: return iunpackselect6(initOffset, in, slot); break; + + case 7: return iunpackselect7(initOffset, in, slot); break; + + case 8: return iunpackselect8(initOffset, in, slot); break; + + case 9: return iunpackselect9(initOffset, in, slot); break; + + case 10: return iunpackselect10(initOffset, in, slot); break; + + case 11: return iunpackselect11(initOffset, in, slot); break; + + case 12: return iunpackselect12(initOffset, in, slot); break; + + case 13: return iunpackselect13(initOffset, in, slot); break; + + case 14: return iunpackselect14(initOffset, in, slot); break; + + case 15: return iunpackselect15(initOffset, in, slot); break; + + case 16: return iunpackselect16(initOffset, in, slot); break; + + case 17: return iunpackselect17(initOffset, in, slot); break; + + case 18: return iunpackselect18(initOffset, in, slot); break; + + case 19: return iunpackselect19(initOffset, in, slot); break; + + case 20: return iunpackselect20(initOffset, in, slot); break; + + case 21: return iunpackselect21(initOffset, in, slot); break; + + case 22: return iunpackselect22(initOffset, in, slot); break; + + case 23: return iunpackselect23(initOffset, in, slot); break; + + case 24: return 
iunpackselect24(initOffset, in, slot); break; + + case 25: return iunpackselect25(initOffset, in, slot); break; + + case 26: return iunpackselect26(initOffset, in, slot); break; + + case 27: return iunpackselect27(initOffset, in, slot); break; + + case 28: return iunpackselect28(initOffset, in, slot); break; + + case 29: return iunpackselect29(initOffset, in, slot); break; + + case 30: return iunpackselect30(initOffset, in, slot); break; + + case 31: return iunpackselect31(initOffset, in, slot); break; + + case 32: return iunpackselect32(initOffset, in, slot); break; + + default: break; + } + + return (-1); +} + +static void +iunpackscan1(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<1)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan2(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<2)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = 
_mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan3(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<3)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan4(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<4)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = 
_mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void 
+iunpackscan5(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<5)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan6(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<6)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan7(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<7)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset 
= out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); 
+ out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan8(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<8)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan9(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<9)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = 
out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan10(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<10)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + 
out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan11(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<11)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan12(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<12)-1); + + tmp = 
InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan13(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<13)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = 
_mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan14(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<14)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan15(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<15)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan16(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<16)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = 
tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan17(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<17)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); 
+ *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan18(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<18)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = 
_mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
18-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan19(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<19)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + 
tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = 
_mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + +} + +static void +iunpackscan20(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<20)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan21(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<21)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + 
tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan22(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<22)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = 
_mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan23(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<23)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = 
_mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan24(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = 
_mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<24)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); 
+ + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan25(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<25)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + 
tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan26(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<26)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + 
+ PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + PrefixSum(out, out, *initOffset); + 
*initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
26-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan27(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<27)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), 
mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan28(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<28)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + 
+ + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan29(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<29)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + PrefixSum(out, out, 
*initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + 
out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan30(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<30)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + 
++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = 
_mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + PrefixSum(out, 
out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); 
+ + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + + +} + +static void +iunpackscan31(__m128i * initOffset, const __m128i *in) +{ + + __m128i InReg = _mm_loadu_si128(in); + __m128i out; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<31)-1); + + tmp = InReg; + out = _mm_and_si128(tmp, mask); + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,31); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,30); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,29); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,28); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,27); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,26); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,25); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + 
PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,24); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,23); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,22); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,21); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,20); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,19); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,18); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,17); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,16); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,15); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,14); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,13); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,12); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,11); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,10); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,9); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,8); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,7); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out 
= _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,6); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,5); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,4); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,3); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,2); + out = tmp; + ++in; InReg = _mm_loadu_si128(in); + out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + PrefixSum(out, out, *initOffset); + *initOffset = out; + + + tmp = _mm_srli_epi32(InReg,1); + out = tmp; + PrefixSum(out, out, *initOffset); + *initOffset = out; + + +} + +static void +iunpackscan32(__m128i * initOffset , const __m128i *in) +{ + *initOffset = _mm_load_si128(in + 31); +} + + +void +simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit) +{ + switch (bit) { + case 0: return; break; + + case 1: iunpackscan1(initOffset, in); break; + + case 2: iunpackscan2(initOffset, in); break; + + case 3: iunpackscan3(initOffset, in); break; + + case 4: iunpackscan4(initOffset, in); break; + + case 5: iunpackscan5(initOffset, in); break; + + case 6: iunpackscan6(initOffset, in); break; + + case 7: iunpackscan7(initOffset, in); break; + + case 8: iunpackscan8(initOffset, in); break; + + 
case 9: iunpackscan9(initOffset, in); break; + + case 10: iunpackscan10(initOffset, in); break; + + case 11: iunpackscan11(initOffset, in); break; + + case 12: iunpackscan12(initOffset, in); break; + + case 13: iunpackscan13(initOffset, in); break; + + case 14: iunpackscan14(initOffset, in); break; + + case 15: iunpackscan15(initOffset, in); break; + + case 16: iunpackscan16(initOffset, in); break; + + case 17: iunpackscan17(initOffset, in); break; + + case 18: iunpackscan18(initOffset, in); break; + + case 19: iunpackscan19(initOffset, in); break; + + case 20: iunpackscan20(initOffset, in); break; + + case 21: iunpackscan21(initOffset, in); break; + + case 22: iunpackscan22(initOffset, in); break; + + case 23: iunpackscan23(initOffset, in); break; + + case 24: iunpackscan24(initOffset, in); break; + + case 25: iunpackscan25(initOffset, in); break; + + case 26: iunpackscan26(initOffset, in); break; + + case 27: iunpackscan27(initOffset, in); break; + + case 28: iunpackscan28(initOffset, in); break; + + case 29: iunpackscan29(initOffset, in); break; + + case 30: iunpackscan30(initOffset, in); break; + + case 31: iunpackscan31(initOffset, in); break; + + case 32: iunpackscan32(initOffset, in); break; + + default: break; + } + + return ; +} + +#endif diff --git a/tests/unit.c b/tests/unit.c new file mode 100644 index 000000000..341ef4e37 --- /dev/null +++ b/tests/unit.c @@ -0,0 +1,900 @@ +/** + * This code is released under a BSD License. 
+ */ +#include +#include +#include +#include "simdcomp.h" + + + +int testshortpack() { + int bit; + size_t i; + size_t length; + __m128i * bb; + srand(0); + printf("testshortpack\n"); + for (bit = 0; bit < 32; ++bit) { + const size_t N = 128; + uint32_t * data = malloc(N * sizeof(uint32_t)); + uint32_t * backdata = malloc(N * sizeof(uint32_t)); + uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((1 << bit) - 1); + } + for (length = 0; length <= N; ++length) { + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + bb = simdpack_shortlength(data, length, (__m128i *) buffer, + bit); + if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) { + printf("bug\n"); + return -1; + } + simdunpack_shortlength((__m128i *) buffer, length, + backdata, bit); + for (i = 0; i < length; ++i) { + + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + } + free(data); + free(backdata); + free(buffer); + } + return 0; +} + +int testlongpack() { + int bit; + size_t i; + size_t length; + __m128i * bb; + srand(0); + printf("testlongpack\n"); + for (bit = 0; bit < 32; ++bit) { + const size_t N = 2048; + uint32_t * data = malloc(N * sizeof(uint32_t)); + uint32_t * backdata = malloc(N * sizeof(uint32_t)); + uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((1 << bit) - 1); + } + for (length = 0; length <= N; ++length) { + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + bb = simdpack_length(data, length, (__m128i *) buffer, + bit); + if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) { + printf("bug\n"); + return -1; + } + simdunpack_length((__m128i *) buffer, length, + backdata, bit); + for (i = 0; i < length; ++i) { + + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + } + free(data); + free(backdata); + free(buffer); + } + return 0; 
+} + + + +int testset() { + int bit; + size_t i; + const size_t N = 128; + uint32_t * data = malloc(N * sizeof(uint32_t)); + uint32_t * backdata = malloc(N * sizeof(uint32_t)); + uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + srand(0); + + for (bit = 0; bit < 32; ++bit) { + printf("simple set %d \n",bit); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((1 << bit) - 1); + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + simdpack(data, (__m128i *) buffer, bit); + simdunpack((__m128i *) buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + + for(i = N ; i > 0; i--) { + simdfastset((__m128i *) buffer, bit, data[N - i], i - 1); + } + simdunpack((__m128i *) buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[N - i - 1]) { + printf("bug\n"); + return -1; + } + } + simdpack(data, (__m128i *) buffer, bit); + for(i = 1 ; i <= N; i++) { + simdfastset((__m128i *) buffer, bit, data[i - 1], i - 1); + } + simdunpack((__m128i *) buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) { + printf("bug\n"); + return -1; + } + } + + } + free(data); + free(backdata); + free(buffer); + + return 0; +} + +#ifdef __SSE4_1__ + +int testsetd1() { + int bit; + size_t i; + uint32_t newvalue; + const size_t N = 128; + uint32_t * data = malloc(N * sizeof(uint32_t)); + uint32_t * datazeroes = malloc(N * sizeof(uint32_t)); + + uint32_t * backdata = malloc(N * sizeof(uint32_t)); + uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + srand(0); + for (bit = 0; bit < 32; ++bit) { + printf("simple set d1 %d \n",bit); + data[0] = rand() & ((1 << bit) - 1); + datazeroes[0] = 0; + + for (i = 1; i < N; ++i) { + data[i] = data[i - 1] + (rand() & ((1 << bit) - 1)); + datazeroes[i] = 0; + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + simdpackd1(0,datazeroes, (__m128i *) buffer, bit); + for(i = 1 ; i <= N; i++) { + 
simdfastsetd1(0,(__m128i *) buffer, bit, data[i - 1], i - 1); + newvalue = simdselectd1(0, (const __m128i *) buffer, bit,i - 1); + if( newvalue != data[i-1] ) { + printf("bad set-select\n"); + return -1; + } + } + simdunpackd1(0,(__m128i *) buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) + return -1; + } + } + free(data); + free(backdata); + free(buffer); + free(datazeroes); + return 0; +} +#endif + +int testsetFOR() { + int bit; + size_t i; + uint32_t newvalue; + const size_t N = 128; + uint32_t * data = malloc(N * sizeof(uint32_t)); + uint32_t * datazeroes = malloc(N * sizeof(uint32_t)); + + uint32_t * backdata = malloc(N * sizeof(uint32_t)); + uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + srand(0); + for (bit = 0; bit < 32; ++bit) { + printf("simple set FOR %d \n",bit); + for (i = 0; i < N; ++i) { + data[i] = (rand() & ((1 << bit) - 1)); + datazeroes[i] = 0; + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + simdpackFOR(0,datazeroes, (__m128i *) buffer, bit); + for(i = 1 ; i <= N; i++) { + simdfastsetFOR(0,(__m128i *) buffer, bit, data[i - 1], i - 1); + newvalue = simdselectFOR(0, (const __m128i *) buffer, bit,i - 1); + if( newvalue != data[i-1] ) { + printf("bad set-select\n"); + return -1; + } + } + simdunpackFOR(0,(__m128i *) buffer, backdata, bit); + for (i = 0; i < N; ++i) { + if (data[i] != backdata[i]) + return -1; + } + } + free(data); + free(backdata); + free(buffer); + free(datazeroes); + return 0; +} + +int testshortFORpack() { + int bit; + size_t i; + __m128i * rb; + size_t length; + uint32_t offset = 7; + srand(0); + for (bit = 0; bit < 32; ++bit) { + const size_t N = 128; + uint32_t * data = malloc(N * sizeof(uint32_t)); + uint32_t * backdata = malloc(N * sizeof(uint32_t)); + uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); + + for (i = 0; i < N; ++i) { + data[i] = (rand() & ((1 << bit) - 1)) + offset; + } + for (length = 0; length <= N; ++length) { + for (i = 0; i < N; ++i) 
{ + backdata[i] = 0; + } + rb = simdpackFOR_length(offset,data, length, (__m128i *) buffer, + bit); + if(((rb - (__m128i *) buffer)*sizeof(__m128i)) != (unsigned) simdpackFOR_compressedbytes(length,bit)) { + return -1; + } + simdunpackFOR_length(offset,(__m128i *) buffer, length, + backdata, bit); + for (i = 0; i < length; ++i) { + + if (data[i] != backdata[i]) + return -1; + } + } + free(data); + free(backdata); + free(buffer); + } + return 0; +} + + +#ifdef __AVX2__ + +int testbabyavx() { + int bit; + int trial; + unsigned int i,j; + const size_t N = AVXBlockSize; + srand(0); + printf("testbabyavx\n"); + printf("bit = "); + for (bit = 0; bit < 32; ++bit) { + printf(" %d ",bit); + fflush(stdout); + for(trial = 0; trial < 100; ++trial) { + uint32_t * data = malloc(N * sizeof(uint32_t)+ 64 * sizeof(uint32_t)); + uint32_t * backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t) ); + __m256i * buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32); + + for (i = 0; i < N; ++i) { + data[i] = rand() & ((uint32_t)(1 << bit) - 1); + } + for (i = 0; i < N; ++i) { + backdata[i] = 0; + } + if(avxmaxbits(data) != maxbits_length(data,N)) { + printf("avxmaxbits is buggy\n"); + return -1; + } + + avxpackwithoutmask(data, buffer, bit); + avxunpack(buffer, backdata, bit); + for (i = 0; i < AVXBlockSize; ++i) { + if (data[i] != backdata[i]) { + printf("bug\n"); + for (j = 0; j < N; ++j) { + if (data[j] != backdata[j]) { + printf("data[%d]=%d v.s. 
backdata[%d]=%d\n",j,data[j],j,backdata[j]); + } else { + printf("data[%d]=%d\n",j,data[j]); + } + } + return -1; + } + } + free(data); + free(backdata); + free(buffer); + } + } + printf("\n"); + return 0; +} + +int testavx2() { + int N = 5000 * AVXBlockSize, gap; + __m256i * buffer = malloc(AVXBlockSize * sizeof(uint32_t)); + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint32_t * backbuffer = malloc(AVXBlockSize * sizeof(uint32_t)); + for (gap = 1; gap <= 387420489; gap *= 3) { + int k; + printf(" gap = %u \n", gap); + for (k = 0; k < N; ++k) + datain[k] = k * gap; + for (k = 0; k * AVXBlockSize < N; ++k) { + /* + First part works for general arrays (sorted or unsorted) + */ + int j; + /* we compute the bit width */ + const uint32_t b = avxmaxbits(datain + k * AVXBlockSize); + if(avxmaxbits(datain + k * AVXBlockSize) != maxbits_length(datain + k * AVXBlockSize,AVXBlockSize)) { + printf("avxmaxbits is buggy %d %d \n", + avxmaxbits(datain + k * AVXBlockSize), + maxbits_length(datain + k * AVXBlockSize,AVXBlockSize)); + return -1; + } + printf("bit width = %d\n",b); + + + /* we read 256 integers at "datain + k * AVXBlockSize" and + write b 256-bit vectors at "buffer" */ + avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */ + avxunpack(buffer, backbuffer, b);/* uncompressed */ + for (j = 0; j < AVXBlockSize; ++j) { + if (backbuffer[j] != datain[k * AVXBlockSize + j]) { + int i; + printf("bug in avxpack\n"); + for(i = 0; i < AVXBlockSize; ++i) { + printf("data[%d]=%d got back %d %s\n",i, + datain[k * AVXBlockSize + i],backbuffer[i], + datain[k * AVXBlockSize + i]!=backbuffer[i]?"bug":""); + } + return -2; + } + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} +#endif /* avx2 */ + +int test() { + int N = 5000 * SIMDBlockSize, gap; + __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t * 
datain = malloc(N * sizeof(uint32_t)); + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + for (gap = 1; gap <= 387420489; gap *= 3) { + int k; + printf(" gap = %u \n", gap); + for (k = 0; k < N; ++k) + datain[k] = k * gap; + for (k = 0; k * SIMDBlockSize < N; ++k) { + /* + First part works for general arrays (sorted or unsorted) + */ + int j; + /* we compute the bit width */ + const uint32_t b = maxbits(datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b 128-bit vectors at "buffer" */ + simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */ + simdunpack(buffer, backbuffer, b);/* uncompressed */ + for (j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack\n"); + return -2; + } + } + + { + /* + next part assumes that the data is sorted (uses differential coding) + */ + uint32_t offset = 0; + /* we compute the bit width */ + const uint32_t b1 = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b1 128-bit vectors at "buffer" */ + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, + b1); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */ + simdunpackd1(offset, buffer, backbuffer, b1); + for (j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack d1\n"); + return -3; + } + } + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} + +#ifdef __SSE4_1__ +int testFOR() { + int N = 5000 * SIMDBlockSize, gap; + __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint32_t * backbuffer = 
malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t tmax, tmin, tb; + for (gap = 1; gap <= 387420489; gap *= 2) { + int k; + printf(" gap = %u \n", gap); + for (k = 0; k < N; ++k) + datain[k] = k * gap; + for (k = 0; k * SIMDBlockSize < N; ++k) { + int j; + simdmaxmin_length(datain + k * SIMDBlockSize,SIMDBlockSize,&tmin,&tmax); + /* we compute the bit width */ + tb = bits(tmax - tmin); + + + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b 128-bit vectors at "buffer" */ + simdpackFOR(tmin,datain + k * SIMDBlockSize, buffer, tb); + + for (j = 0; j < SIMDBlockSize; ++j) { + uint32_t selectedvalue = simdselectFOR(tmin,buffer,tb,j); + if (selectedvalue != datain[k * SIMDBlockSize + j]) { + printf("bug in simdselectFOR\n"); + return -3; + } + } + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */ + simdunpackFOR(tmin,buffer, backbuffer, tb);/* uncompressed */ + for (j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpackFOR\n"); + return -2; + } + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} +#endif + +#define MAX 300 +int test_simdmaxbitsd1_length() { + uint32_t result, buffer[MAX + 1]; + int i, j; + + memset(&buffer[0], 0xff, sizeof(buffer)); + + /* this test creates buffers of different length; each buffer is + * initialized to result in the following deltas: + * length 1: 2 + * length 2: 1 2 + * length 3: 1 1 2 + * length 4: 1 1 1 2 + * length 5: 1 1 1 1 2 + * etc. Each sequence's "maxbits" is 2. 
*/ + for (i = 0; i < MAX; i++) { + for (j = 0; j < i; j++) + buffer[j] = j + 1; + buffer[i] = i + 2; + + result = simdmaxbitsd1_length(0, &buffer[0], i + 1); + if (result != 2) { + printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n", + result, i); + return -1; + } + } + printf("simdmaxbitsd1_length: ok\n"); + return 0; +} + +int uint32_cmp(const void *a, const void *b) +{ + const uint32_t *ia = (const uint32_t *)a; + const uint32_t *ib = (const uint32_t *)b; + if(*ia < *ib) + return -1; + else if (*ia > *ib) + return 1; + return 0; +} + +#ifdef __SSE4_1__ +int test_simdpackedsearch() { + uint32_t buffer[128]; + uint32_t result = 0; + int b, i; + uint32_t init = 0; + __m128i initial = _mm_set1_epi32(init); + + /* initialize the buffer */ + for (i = 0; i < 128; i++) + buffer[i] = (uint32_t)(i + 1); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 1; b <= 32; b++) { + uint32_t out[128]; + /* delta-encode to 'i' bits */ + simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b); + initial = _mm_setzero_si128(); + printf("simdsearchd1: %d bits\n", b); + + /* now perform the searches */ + initial = _mm_set1_epi32(init); + assert(simdsearchd1(&initial, (__m128i *)out, b, 0, &result) == 0); + assert(result > 0); + + for (i = 1; i <= 128; i++) { + initial = _mm_set1_epi32(init); + assert(simdsearchd1(&initial, (__m128i *)out, b, + (uint32_t)i, &result) == i - 1); + assert(result == (unsigned)i); + } + initial = _mm_set1_epi32(init); + assert(simdsearchd1(&initial, (__m128i *)out, b, 200, &result) + == 128); + assert(result > 200); + } + printf("simdsearchd1: ok\n"); + return 0; +} + +int test_simdpackedsearchFOR() { + uint32_t buffer[128]; + uint32_t result = 0; + int b; + uint32_t i; + uint32_t maxv, tmin, tmax, tb; + uint32_t out[128]; + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 1; b <= 
32; b++) { + /* initialize the buffer */ + maxv = (b == 32) + ? 0xFFFFFFFF + : ((1U< 0) + assert(buffer[pos - 1] < buffer[i]); + assert(result == buffer[i]); + } + for (i = 0; i < 128; i++) { + int pos; + if(buffer[i] == 0) continue; + initial = _mm_set1_epi32(init); + pos = simdsearchd1(&initial, (__m128i *)out, b, + buffer[i] - 1, &result); + assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, + buffer[i] - 1, &result)); + assert(buffer[pos] >= buffer[i] - 1); + if(pos > 0) + assert(buffer[pos - 1] < buffer[i] - 1); + assert(result == buffer[pos]); + } + for (i = 0; i < 128; i++) { + int pos; + if (buffer[i] + 1 == 0) + continue; + initial = _mm_set1_epi32(init); + pos = simdsearchd1(&initial, (__m128i *) out, b, + buffer[i] + 1, &result); + assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, + buffer[i] + 1, &result)); + if(pos == 128) { + assert(buffer[i] == buffer[127]); + } else { + assert(buffer[pos] >= buffer[i] + 1); + if (pos > 0) + assert(buffer[pos - 1] < buffer[i] + 1); + assert(result == buffer[pos]); + } + } + } + printf("advanced simdsearchd1: ok\n"); + return 0; +} + +int test_simdpackedselect() { + uint32_t buffer[128]; + uint32_t initial = 33; + int b, i; + + /* initialize the buffer */ + for (i = 0; i < 128; i++) + buffer[i] = (uint32_t)(initial + i); + + /* this test creates delta encoded buffers with different bits, then + * performs lower bound searches for each key */ + for (b = 1; b <= 32; b++) { + uint32_t out[128]; + /* delta-encode to 'i' bits */ + simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b); + + printf("simdselectd1: %d bits\n", b); + + /* now perform the searches */ + for (i = 0; i < 128; i++) { + assert(simdselectd1(initial, (__m128i *)out, b, (uint32_t)i) + == initial + i); + } + } + printf("simdselectd1: ok\n"); + return 0; +} + +int test_simdpackedselect_advanced() { + uint32_t buffer[128]; + uint32_t initial = 33; + uint32_t b; + int i; + + /* this test creates delta encoded buffers with 
different bits, then + * performs lower bound searches for each key */ + for (b = 0; b <= 32; b++) { + uint32_t prev = initial; + uint32_t out[128]; + /* initialize the buffer */ + for (i = 0; i < 128; i++) { + buffer[i] = ((uint32_t)(165576 * i)) ; + if(b < 32) buffer[i] %= (1< +#include +#include +#include "simdcomp.h" + + +#define get_random_char() (uint8_t)(rand() % 256); + + +int main() { + int N = 5000 * SIMDBlockSize, gap; + __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + + srand(time(NULL)); + + for (gap = 1; gap <= 387420489; gap *= 3) { + int k; + printf(" gap = %u \n", gap); + + /* simulate some random character string, don't care about endiannes */ + for (k = 0; k < N; ++k) { + uint8_t _tmp[4]; + + _tmp[0] = get_random_char(); + _tmp[1] = get_random_char(); + _tmp[2] = get_random_char(); + _tmp[3] = get_random_char(); + + memmove(&datain[k], _tmp, 4); + } + for (k = 0; k * SIMDBlockSize < N; ++k) { + /* + First part works for general arrays (sorted or unsorted) + */ + int j; + /* we compute the bit width */ + const uint32_t b = maxbits(datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b 128-bit vectors at "buffer" */ + simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */ + simdunpack(buffer, backbuffer, b);/* uncompressed */ + for (j = 0; j < SIMDBlockSize; ++j) { + uint8_t chars_back[4]; + uint8_t chars_in[4]; + + memmove(chars_back, &backbuffer[j], 4); + memmove(chars_in, &datain[k * SIMDBlockSize + j], 4); + + if (chars_in[0] != chars_back[0] + || chars_in[1] != chars_back[1] + || chars_in[2] != chars_back[2] + || chars_in[3] != chars_back[3]) { + printf("bug in simdpack\n"); + return -2; + } + } + + { + /* + next part assumes that the data is sorted (uses differential 
coding) + */ + uint32_t offset = 0; + /* we compute the bit width */ + const uint32_t b1 = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + /* we read 128 integers at "datain + k * SIMDBlockSize" and + write b1 128-bit vectors at "buffer" */ + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, + b1); + /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */ + simdunpackd1(offset, buffer, backbuffer, b1); + for (j = 0; j < SIMDBlockSize; ++j) { + uint8_t chars_back[4]; + uint8_t chars_in[4]; + + memmove(chars_back, &backbuffer[j], 4); + memmove(chars_in, &datain[k * SIMDBlockSize + j], 4); + + if (chars_in[0] != chars_back[0] + || chars_in[1] != chars_back[1] + || chars_in[2] != chars_back[2] + || chars_in[3] != chars_back[3]) { + printf("bug in simdpack\n"); + return -3; + } + } + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + } + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +}