Files
tantivy/cpp/simdcomp/src/avxbitpacking.c

7796 lines
428 KiB
C

#include "avxbitpacking.h"
#ifdef __AVX2__
static uint32_t maxbitas32int(const __m256i accumulator) {
const __m256i _tmp1 = _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/
uint32_t ans1 = _mm256_extract_epi32(_tmp2,0);
uint32_t ans2 = _mm256_extract_epi32(_tmp2,4);
uint32_t ans = ans1 > ans2 ? ans1 : ans2;
return bits(ans);
}
uint32_t avxmaxbits(const uint32_t * begin) {
const __m256i* pin = (const __m256i*)(begin);
__m256i accumulator = _mm256_lddqu_si256(pin);
uint32_t k = 1;
for(; 8*k < AVXBlockSize; ++k) {
__m256i newvec = _mm256_lddqu_si256(pin+k);
accumulator = _mm256_or_si256(accumulator,newvec);
}
return maxbitas32int(accumulator);
}
/** code generated by avxpacking.py starts here **/
typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);
typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);
static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {
(void)compressed;
(void) pin; /* we consumed 256 32-bit integers */
}
/* we are going to pack 256 1-bit values, touching 1 256-bit words, using 16 bytes */
static void avxpackblock1(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 1 256-bit word */
__m256i w0;
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 1));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 3));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 9));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 13));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 15));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 17));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 19));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 21));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 22));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 23));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 25));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 26));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 27));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 28));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 29));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 30));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 31));
_mm256_storeu_si256(compressed + 0, w0);
}
/* we are going to pack 256 2-bit values, touching 2 256-bit words, using 32 bytes */
static void avxpackblock2(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 2 256-bit words */
__m256i w0, w1;
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 22));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 26));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 28));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 30));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 6));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 14));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 18));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 22));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 24));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 26));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 28));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 30));
_mm256_storeu_si256(compressed + 1, w1);
}
/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 48 bytes */
static void avxpackblock3(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 3 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 3));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 9));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 15));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 21));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 27));
tmp = _mm256_lddqu_si256 (in + 10);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 7));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 13));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 19));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 22));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 25));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 28));
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 17));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 23));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 26));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 29));
_mm256_storeu_si256(compressed + 2, w0);
}
/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 bytes */
static void avxpackblock4(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 4 256-bit words */
__m256i w0, w1;
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 28));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 24));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 28));
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 28));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 24));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 28));
_mm256_storeu_si256(compressed + 3, w1);
}
/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 bytes */
static void avxpackblock5(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 5 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 15));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 25));
tmp = _mm256_lddqu_si256 (in + 6);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 13));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 18));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 23));
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 1));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 21));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 26));
tmp = _mm256_lddqu_si256 (in + 19);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 9));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 14));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 19));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 24));
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 17));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 22));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 27));
_mm256_storeu_si256(compressed + 4, w0);
}
/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 bytes */
static void avxpackblock6(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 6 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 24));
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 22));
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 26));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 6));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 18));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 24));
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 22));
tmp = _mm256_lddqu_si256 (in + 26);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 14));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 26));
_mm256_storeu_si256(compressed + 5, w1);
}
/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 bytes */
static void avxpackblock7(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 7 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 21));
tmp = _mm256_lddqu_si256 (in + 4);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 17));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 24));
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 13));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 20));
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 9));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 23));
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 19));
tmp = _mm256_lddqu_si256 (in + 22);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 15));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 22));
tmp = _mm256_lddqu_si256 (in + 27);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 25));
_mm256_storeu_si256(compressed + 6, w0);
}
/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 bytes */
static void avxpackblock8(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 8 256-bit words */
__m256i w0, w1;
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 24));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 24));
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 24));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 24));
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 24));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_lddqu_si256 (in + 20);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 24));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 24));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 24));
_mm256_storeu_si256(compressed + 7, w1);
}
/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 bytes */
static void avxpackblock9(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 9 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 9));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 18));
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 13));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 22));
tmp = _mm256_lddqu_si256 (in + 7);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 17));
tmp = _mm256_lddqu_si256 (in + 10);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 21));
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
tmp = _mm256_lddqu_si256 (in + 17);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 11));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 20));
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 15));
tmp = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 19));
tmp = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 23));
_mm256_storeu_si256(compressed + 8, w0);
}
/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 bytes */
static void avxpackblock10(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 10 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 20));
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 18));
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
tmp = _mm256_lddqu_si256 (in + 9);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 14));
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 22));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 20));
tmp = _mm256_lddqu_si256 (in + 19);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 18));
tmp = _mm256_lddqu_si256 (in + 22);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 6));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 14));
tmp = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 22));
_mm256_storeu_si256(compressed + 9, w1);
}
/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 bytes */
static void avxpackblock11(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 11 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 11));
tmp = _mm256_lddqu_si256 (in + 2);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12));
tmp = _mm256_lddqu_si256 (in + 5);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 13));
tmp = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 14));
tmp = _mm256_lddqu_si256 (in + 11);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 15));
tmp = _mm256_lddqu_si256 (in + 14);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 5));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 17));
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 7));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 18));
tmp = _mm256_lddqu_si256 (in + 23);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 19));
tmp = _mm256_lddqu_si256 (in + 26);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 9));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 20));
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 21));
_mm256_storeu_si256(compressed + 10, w0);
}
/* we are going to pack 256 12-bit values, touching 12 256-bit words, using 192 bytes */
static void avxpackblock12(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 12 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 12));
tmp = _mm256_lddqu_si256 (in + 2);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 16));
tmp = _mm256_lddqu_si256 (in + 5);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 20));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 12));
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 16));
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 20));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 12));
tmp = _mm256_lddqu_si256 (in + 18);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 16));
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 20));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 12));
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 16));
tmp = _mm256_lddqu_si256 (in + 29);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 20));
_mm256_storeu_si256(compressed + 11, w1);
}
/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 bytes */
static void avxpackblock13(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 13 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 13));
tmp = _mm256_lddqu_si256 (in + 2);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 7));
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 1));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 14));
tmp = _mm256_lddqu_si256 (in + 7);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
w0 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 15));
tmp = _mm256_lddqu_si256 (in + 12);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 9));
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 3));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
tmp = _mm256_lddqu_si256 (in + 17);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 10));
tmp = _mm256_lddqu_si256 (in + 19);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 17));
tmp = _mm256_lddqu_si256 (in + 22);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 11));
tmp = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 18));
tmp = _mm256_lddqu_si256 (in + 27);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 12));
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 19));
_mm256_storeu_si256(compressed + 12, w0);
}
/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 bytes */
static void avxpackblock14(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 14 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 14));
tmp = _mm256_lddqu_si256 (in + 2);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 10));
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 6));
tmp = _mm256_lddqu_si256 (in + 6);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 12));
tmp = _mm256_lddqu_si256 (in + 11);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8));
tmp = _mm256_lddqu_si256 (in + 13);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 18));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 14));
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 10));
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 6));
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
tmp = _mm256_lddqu_si256 (in + 25);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 12));
tmp = _mm256_lddqu_si256 (in + 27);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8));
tmp = _mm256_lddqu_si256 (in + 29);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 18));
_mm256_storeu_si256(compressed + 13, w1);
}
/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 bytes */
static void avxpackblock15(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 15 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 15));
tmp = _mm256_lddqu_si256 (in + 2);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 13));
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 11));
tmp = _mm256_lddqu_si256 (in + 6);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 9));
tmp = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 7));
tmp = _mm256_lddqu_si256 (in + 10);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 5));
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 3));
tmp = _mm256_lddqu_si256 (in + 14);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 14));
tmp = _mm256_lddqu_si256 (in + 19);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 12));
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 10));
tmp = _mm256_lddqu_si256 (in + 23);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 6));
tmp = _mm256_lddqu_si256 (in + 27);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4));
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 17));
_mm256_storeu_si256(compressed + 14, w0);
}
/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 bytes */
static void avxpackblock16(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 16 256-bit words */
__m256i w0, w1;
w0 = _mm256_lddqu_si256 (in + 0);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 16));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 16));
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_lddqu_si256 (in + 4);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 16));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 16));
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 16));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 16));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_lddqu_si256 (in + 12);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 16));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 16));
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 16));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 16));
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 16));
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 16));
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 16));
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 16));
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 16));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 16));
_mm256_storeu_si256(compressed + 15, w1);
}
/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 bytes */
static void avxpackblock17(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 17 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 2));
tmp = _mm256_lddqu_si256 (in + 3);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 6));
tmp = _mm256_lddqu_si256 (in + 7);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
tmp = _mm256_lddqu_si256 (in + 9);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 10));
tmp = _mm256_lddqu_si256 (in + 11);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12));
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 14));
tmp = _mm256_lddqu_si256 (in + 15);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 1));
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 3));
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 5));
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 7));
tmp = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 9));
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 11));
tmp = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 13));
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 15));
_mm256_storeu_si256(compressed + 16, w0);
}
/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 bytes */
static void avxpackblock18(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 18 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 4));
tmp = _mm256_lddqu_si256 (in + 3);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8));
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 12));
tmp = _mm256_lddqu_si256 (in + 7);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 2));
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 6));
tmp = _mm256_lddqu_si256 (in + 12);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 10));
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 14));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_lddqu_si256 (in + 16);
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 4));
tmp = _mm256_lddqu_si256 (in + 19);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8));
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 12));
tmp = _mm256_lddqu_si256 (in + 23);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 2));
tmp = _mm256_lddqu_si256 (in + 26);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 6));
tmp = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 10));
tmp = _mm256_lddqu_si256 (in + 30);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 14));
_mm256_storeu_si256(compressed + 17, w1);
}
/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 bytes */
static void avxpackblock19(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 19 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 6));
tmp = _mm256_lddqu_si256 (in + 3);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12));
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 5));
tmp = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 11));
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 11);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
tmp = _mm256_lddqu_si256 (in + 13);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 10));
tmp = _mm256_lddqu_si256 (in + 15);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 3));
tmp = _mm256_lddqu_si256 (in + 18);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 9));
tmp = _mm256_lddqu_si256 (in + 20);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 21);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 2));
tmp = _mm256_lddqu_si256 (in + 23);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
w0 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
tmp = _mm256_lddqu_si256 (in + 25);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 1));
tmp = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 7));
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 13));
_mm256_storeu_si256(compressed + 18, w0);
}
/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 bytes */
static void avxpackblock20(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 20 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 8));
tmp = _mm256_lddqu_si256 (in + 3);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 4);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 4));
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 12));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_lddqu_si256 (in + 8);
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 8));
tmp = _mm256_lddqu_si256 (in + 11);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 4));
tmp = _mm256_lddqu_si256 (in + 14);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 12));
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_lddqu_si256 (in + 16);
tmp = _mm256_lddqu_si256 (in + 17);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 8));
tmp = _mm256_lddqu_si256 (in + 19);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 4));
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 12));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_lddqu_si256 (in + 24);
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 8));
tmp = _mm256_lddqu_si256 (in + 27);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 4));
tmp = _mm256_lddqu_si256 (in + 30);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 12));
_mm256_storeu_si256(compressed + 19, w1);
}
/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 bytes */
static void avxpackblock21(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 21 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 10));
tmp = _mm256_lddqu_si256 (in + 3);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 4);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 9));
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 7);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 10);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 7));
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 6));
tmp = _mm256_lddqu_si256 (in + 15);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 5));
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 19);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4));
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_lddqu_si256 (in + 22);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 3));
tmp = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 25);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
w1 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 2));
tmp = _mm256_lddqu_si256 (in + 27);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 1));
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 11));
_mm256_storeu_si256(compressed + 20, w0);
}
/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 bytes */
static void avxpackblock22(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 22 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 2));
tmp = _mm256_lddqu_si256 (in + 4);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 5);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 4));
tmp = _mm256_lddqu_si256 (in + 7);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 6));
tmp = _mm256_lddqu_si256 (in + 10);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 11);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8));
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 10));
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_lddqu_si256 (in + 16);
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 18);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 2));
tmp = _mm256_lddqu_si256 (in + 20);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_lddqu_si256 (in + 21);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 4));
tmp = _mm256_lddqu_si256 (in + 23);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 6));
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 27);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8));
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_lddqu_si256 (in + 30);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 10));
_mm256_storeu_si256(compressed + 21, w1);
}
/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 bytes */
static void avxpackblock23(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 23 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
w1 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 5));
tmp = _mm256_lddqu_si256 (in + 4);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 5);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 6);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 1));
tmp = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 9);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 6));
tmp = _mm256_lddqu_si256 (in + 11);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 12);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 13);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 11));
w0 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 2));
tmp = _mm256_lddqu_si256 (in + 15);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 7));
tmp = _mm256_lddqu_si256 (in + 18);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 19);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
w0 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 3));
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 23);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 26);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 27);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
w0 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4));
tmp = _mm256_lddqu_si256 (in + 29);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 21, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 9));
_mm256_storeu_si256(compressed + 22, w0);
}
/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 bytes */
static void avxpackblock24(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 24 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 8));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 4);
tmp = _mm256_lddqu_si256 (in + 5);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 6);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 8));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_lddqu_si256 (in + 8);
tmp = _mm256_lddqu_si256 (in + 9);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 8));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_lddqu_si256 (in + 12);
tmp = _mm256_lddqu_si256 (in + 13);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_lddqu_si256 (in + 14);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 8));
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_lddqu_si256 (in + 16);
tmp = _mm256_lddqu_si256 (in + 17);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 8));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_lddqu_si256 (in + 20);
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 22);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 8));
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_lddqu_si256 (in + 24);
tmp = _mm256_lddqu_si256 (in + 25);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 8));
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_lddqu_si256 (in + 28);
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 30);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 22, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 8));
_mm256_storeu_si256(compressed + 23, w1);
}
/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 bytes */
static void avxpackblock25(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 25 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
tmp = _mm256_lddqu_si256 (in + 5);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 6);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_lddqu_si256 (in + 7);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 15));
w0 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 1));
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 11);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 5));
tmp = _mm256_lddqu_si256 (in + 14);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 15);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
w0 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 2));
tmp = _mm256_lddqu_si256 (in + 19);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_lddqu_si256 (in + 20);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 21);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
w1 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 6));
tmp = _mm256_lddqu_si256 (in + 23);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 17));
w0 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_lddqu_si256 (in + 26);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 3));
tmp = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 29);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 23, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 7));
_mm256_storeu_si256(compressed + 24, w0);
}
/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 bytes */
static void avxpackblock26(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 26 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 2));
tmp = _mm256_lddqu_si256 (in + 6);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_lddqu_si256 (in + 7);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 4));
tmp = _mm256_lddqu_si256 (in + 11);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 6));
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_lddqu_si256 (in + 16);
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_lddqu_si256 (in + 18);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_lddqu_si256 (in + 19);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 2));
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 23);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_lddqu_si256 (in + 25);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 4));
tmp = _mm256_lddqu_si256 (in + 27);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_lddqu_si256 (in + 30);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 24, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 6));
_mm256_storeu_si256(compressed + 25, w1);
}
/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 bytes */
static void avxpackblock27(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 27 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
w1 = _mm256_srli_epi32(tmp,25);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 2));
tmp = _mm256_lddqu_si256 (in + 7);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 8);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 10);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 11);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
w0 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 15);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_lddqu_si256 (in + 17);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 1));
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 22);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 23);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
w0 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 3));
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 27);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_lddqu_si256 (in + 29);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 25, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 5));
_mm256_storeu_si256(compressed + 26, w0);
}
/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 bytes */
static void avxpackblock28(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 28 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 4));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_lddqu_si256 (in + 8);
tmp = _mm256_lddqu_si256 (in + 9);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 10);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 11);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_lddqu_si256 (in + 12);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 13);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 14);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 4));
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_lddqu_si256 (in + 16);
tmp = _mm256_lddqu_si256 (in + 17);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 19);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_lddqu_si256 (in + 20);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 21);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 4));
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_lddqu_si256 (in + 24);
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 26);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_lddqu_si256 (in + 27);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_lddqu_si256 (in + 30);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 26, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 4));
_mm256_storeu_si256(compressed + 27, w1);
}
/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 bytes */
static void avxpackblock29(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 29 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
w1 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 7);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 9);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
w1 = _mm256_srli_epi32(tmp,27);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 2));
tmp = _mm256_lddqu_si256 (in + 11);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_lddqu_si256 (in + 12);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 13);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 14);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 15);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_lddqu_si256 (in + 16);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
w0 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 18);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_lddqu_si256 (in + 19);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 7));
w0 = _mm256_srli_epi32(tmp,25);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
w1 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 1));
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_lddqu_si256 (in + 23);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 25);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_lddqu_si256 (in + 27);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_lddqu_si256 (in + 29);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
w1 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 26, w0);
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 27, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 3));
_mm256_storeu_si256(compressed + 28, w0);
}
/* we are going to pack 256 30-bit values, touching 30 256-bit words, using 480 bytes */
static void avxpackblock30(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 30 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 7);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 9);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_lddqu_si256 (in + 11);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 6));
w1 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
w0 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 2));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_lddqu_si256 (in + 16);
tmp = _mm256_lddqu_si256 (in + 17);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 18);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_lddqu_si256 (in + 19);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 20);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 21);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_lddqu_si256 (in + 22);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_lddqu_si256 (in + 23);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 24);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_lddqu_si256 (in + 25);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_lddqu_si256 (in + 26);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_lddqu_si256 (in + 27);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_lddqu_si256 (in + 28);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 26, w0);
tmp = _mm256_lddqu_si256 (in + 29);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 27, w1);
tmp = _mm256_lddqu_si256 (in + 30);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
w1 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 28, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 2));
_mm256_storeu_si256(compressed + 29, w1);
}
/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 bytes */
static void avxpackblock31(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 31 256-bit words */
__m256i w0, w1;
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_lddqu_si256 (in + 0);
tmp = _mm256_lddqu_si256 (in + 1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_lddqu_si256 (in + 2);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_lddqu_si256 (in + 3);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_lddqu_si256 (in + 4);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_lddqu_si256 (in + 5);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_lddqu_si256 (in + 6);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_lddqu_si256 (in + 7);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_lddqu_si256 (in + 8);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_lddqu_si256 (in + 9);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
w1 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_lddqu_si256 (in + 10);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_lddqu_si256 (in + 11);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_lddqu_si256 (in + 12);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_lddqu_si256 (in + 13);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_lddqu_si256 (in + 14);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_lddqu_si256 (in + 15);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_lddqu_si256 (in + 16);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_lddqu_si256 (in + 17);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_lddqu_si256 (in + 18);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_lddqu_si256 (in + 19);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
w1 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_lddqu_si256 (in + 20);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_lddqu_si256 (in + 21);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_lddqu_si256 (in + 22);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_lddqu_si256 (in + 23);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
w1 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_lddqu_si256 (in + 24);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_lddqu_si256 (in + 25);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
w1 = _mm256_srli_epi32(tmp,25);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_lddqu_si256 (in + 26);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_lddqu_si256 (in + 27);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
w1 = _mm256_srli_epi32(tmp,27);
_mm256_storeu_si256(compressed + 26, w0);
tmp = _mm256_lddqu_si256 (in + 28);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
w0 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 27, w1);
tmp = _mm256_lddqu_si256 (in + 29);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 3));
w1 = _mm256_srli_epi32(tmp,29);
_mm256_storeu_si256(compressed + 28, w0);
tmp = _mm256_lddqu_si256 (in + 30);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 2));
w0 = _mm256_srli_epi32(tmp,30);
_mm256_storeu_si256(compressed + 29, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 1));
_mm256_storeu_si256(compressed + 30, w0);
}
/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 bytes */
static void avxpackblock32(const uint32_t * pin, __m256i * compressed) {
const __m256i * in = (const __m256i *) pin;
/* we are going to touch 32 256-bit words */
__m256i w0, w1;
w0 = _mm256_lddqu_si256 (in + 0);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_lddqu_si256 (in + 1);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_lddqu_si256 (in + 2);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 3);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_lddqu_si256 (in + 4);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_lddqu_si256 (in + 5);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_lddqu_si256 (in + 6);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_lddqu_si256 (in + 7);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_lddqu_si256 (in + 8);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_lddqu_si256 (in + 9);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_lddqu_si256 (in + 10);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_lddqu_si256 (in + 11);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_lddqu_si256 (in + 12);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_lddqu_si256 (in + 13);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_lddqu_si256 (in + 14);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_lddqu_si256 (in + 15);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_lddqu_si256 (in + 16);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_lddqu_si256 (in + 17);
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_lddqu_si256 (in + 18);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_lddqu_si256 (in + 19);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_lddqu_si256 (in + 20);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_lddqu_si256 (in + 21);
_mm256_storeu_si256(compressed + 21, w1);
w0 = _mm256_lddqu_si256 (in + 22);
_mm256_storeu_si256(compressed + 22, w0);
w1 = _mm256_lddqu_si256 (in + 23);
_mm256_storeu_si256(compressed + 23, w1);
w0 = _mm256_lddqu_si256 (in + 24);
_mm256_storeu_si256(compressed + 24, w0);
w1 = _mm256_lddqu_si256 (in + 25);
_mm256_storeu_si256(compressed + 25, w1);
w0 = _mm256_lddqu_si256 (in + 26);
_mm256_storeu_si256(compressed + 26, w0);
w1 = _mm256_lddqu_si256 (in + 27);
_mm256_storeu_si256(compressed + 27, w1);
w0 = _mm256_lddqu_si256 (in + 28);
_mm256_storeu_si256(compressed + 28, w0);
w1 = _mm256_lddqu_si256 (in + 29);
_mm256_storeu_si256(compressed + 29, w1);
w0 = _mm256_lddqu_si256 (in + 30);
_mm256_storeu_si256(compressed + 30, w0);
w1 = _mm256_lddqu_si256 (in + 31);
_mm256_storeu_si256(compressed + 31, w1);
}
static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {
(void)compressed;
(void) pin; /* we consumed 256 32-bit integers */
}
/* we are going to pack 256 1-bit values, touching 1 256-bit words, using 16 bytes */
static void avxpackblockmask1(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 1 256-bit word */
__m256i w0;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 1));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 3));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 9));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 13));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 15));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 17));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 19));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 21));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 22));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 23));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 25));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 26));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 27));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 28));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 29));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 30));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 31));
_mm256_storeu_si256(compressed + 0, w0);
}
/* we are going to pack 256 2-bit values, touching 2 256-bit words, using 32 bytes */
static void avxpackblockmask2(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 2 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(3);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 22));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 26));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 28));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 30));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 6));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 14));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 18));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 22));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 24));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 26));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 28));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 30));
_mm256_storeu_si256(compressed + 1, w1);
}
/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 48 bytes */
static void avxpackblockmask3(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 3 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(7);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 3));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 9));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 15));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 21));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 27));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 7));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 13));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 19));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 22));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 25));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 28));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 17));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 23));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 26));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 29));
_mm256_storeu_si256(compressed + 2, w0);
}
/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 bytes */
static void avxpackblockmask4(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 4 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(15);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 28));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 24));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 28));
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 24));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 28));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 24));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 28));
_mm256_storeu_si256(compressed + 3, w1);
}
/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 bytes */
static void avxpackblockmask5(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 5 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(31);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 15));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 25));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 13));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 18));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 23));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 1));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 21));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 26));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 9));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 14));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 19));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 24));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 17));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 22));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 27));
_mm256_storeu_si256(compressed + 4, w0);
}
/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 bytes */
static void avxpackblockmask6(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 6 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(63);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 24));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 22));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 20));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 26));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 6));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 18));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 24));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 22));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 14));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 20));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 26));
_mm256_storeu_si256(compressed + 5, w1);
}
/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 bytes */
static void avxpackblockmask7(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 7 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(127);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 21));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 17));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 24));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 13));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 20));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 9));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 23));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 19));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 15));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 22));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 11));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 18));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 25));
_mm256_storeu_si256(compressed + 6, w0);
}
/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 bytes */
static void avxpackblockmask8(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 8 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(255);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 24));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 24));
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 24));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 24));
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 24));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 24));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 16));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 24));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 16));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 24));
_mm256_storeu_si256(compressed + 7, w1);
}
/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 bytes */
static void avxpackblockmask9(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 9 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(511);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 9));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 18));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 13));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 22));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 17));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 21));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 7));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 11));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 20));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 15));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 19));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 14));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 23));
_mm256_storeu_si256(compressed + 8, w0);
}
/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 bytes */
static void avxpackblockmask10(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 10 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(1023);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 20));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 18));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 12));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 22));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 10));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 20));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 18));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 6));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 12));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 22));
_mm256_storeu_si256(compressed + 9, w1);
}
/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 bytes */
static void avxpackblockmask11(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 11 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(2047);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 11));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 13));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 3));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 15));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 5));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 17));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 7));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 18));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 19));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 9));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 20));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 10));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 21));
_mm256_storeu_si256(compressed + 10, w0);
}
/* we are going to pack 256 12-bit values, touching 12 256-bit words, using 192 bytes */
static void avxpackblockmask12(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 12 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(4095);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 20));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 20));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 8));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 20));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 8));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 20));
_mm256_storeu_si256(compressed + 11, w1);
}
/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 bytes */
static void avxpackblockmask13(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 13 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(8191);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 13));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 7));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 1));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
w0 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 15));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 9));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 3));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 17));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 11));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 5));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 18));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 6));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 19));
_mm256_storeu_si256(compressed + 12, w0);
}
/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 bytes */
static void avxpackblockmask14(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 14 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(16383);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 2));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 4));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 18));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 4));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 18));
_mm256_storeu_si256(compressed + 13, w1);
}
/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 bytes */
static void avxpackblockmask15(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 15 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(32767);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 15));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 13));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 11));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 9));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 7));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 5));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 3));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 1));
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 2));
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 17));
_mm256_storeu_si256(compressed + 14, w0);
}
/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 bytes */
static void avxpackblockmask16(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 16 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(65535);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 16));
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 16));
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 16));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 16));
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 16));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 16));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 16));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 16));
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 16));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 16));
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 16));
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 16));
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 16));
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 16));
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 16));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 16));
_mm256_storeu_si256(compressed + 15, w1);
}
/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 bytes */
static void avxpackblockmask17(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 17 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(131071);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 14));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 1));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 3));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 5));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 7));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 9));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 11));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 13));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 15));
_mm256_storeu_si256(compressed + 16, w0);
}
/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 bytes */
static void avxpackblockmask18(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 18 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(262143);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 14));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 14));
_mm256_storeu_si256(compressed + 17, w1);
}
/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 bytes */
static void avxpackblockmask19(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 19 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(524287);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 5));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 11));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 3));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 9));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
w0 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 1));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 7));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 13));
_mm256_storeu_si256(compressed + 18, w0);
}
/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 bytes */
static void avxpackblockmask20(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 20 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(1048575);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 12));
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 12));
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 12));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 12));
_mm256_storeu_si256(compressed + 19, w1);
}
/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 bytes */
static void avxpackblockmask21(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 21 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(2097151);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 10));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 9));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 7));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
w0 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 5));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 3));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
w1 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 1));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 11));
_mm256_storeu_si256(compressed + 20, w0);
}
/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 bytes */
static void avxpackblockmask22(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 22 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(4194303);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 10));
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 10));
_mm256_storeu_si256(compressed + 21, w1);
}
/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 bytes */
static void avxpackblockmask23(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 23 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(8388607);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
w1 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 5));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 1));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 11));
w0 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 7));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
w0 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 3));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
w0 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 21, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 9));
_mm256_storeu_si256(compressed + 22, w0);
}
/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 bytes */
static void avxpackblockmask24(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 24 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(16777215);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 8));
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 8));
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 8));
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 8));
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 8));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 8));
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 8));
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 22, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 8));
_mm256_storeu_si256(compressed + 23, w1);
}
/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 bytes */
static void avxpackblockmask25(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 25 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(33554431);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 15));
w0 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 1));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 5));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
w0 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
w1 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 6));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 17));
w0 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 3));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 23, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 7));
_mm256_storeu_si256(compressed + 24, w0);
}
/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 bytes */
static void avxpackblockmask26(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 26 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(67108863);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 6));
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 24, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 6));
_mm256_storeu_si256(compressed + 25, w1);
}
/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 bytes */
static void avxpackblockmask27(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 27 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(134217727);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
w1 = _mm256_srli_epi32(tmp,25);
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
w0 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
w0 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 1));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
w0 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
w0 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 3));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 25, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 5));
_mm256_storeu_si256(compressed + 26, w0);
}
/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 bytes */
static void avxpackblockmask28(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 28 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(268435455);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 4));
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 4));
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 4));
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 26, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 4));
_mm256_storeu_si256(compressed + 27, w1);
}
/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 bytes */
static void avxpackblockmask29(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 29 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(536870911);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
w1 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
w1 = _mm256_srli_epi32(tmp,27);
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 2));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
w0 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
w0 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
w0 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
w0 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 7));
w0 = _mm256_srli_epi32(tmp,25);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
w1 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 1));
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
w1 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 26, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 27, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 3));
_mm256_storeu_si256(compressed + 28, w0);
}
/* we are going to pack 256 30-bit values, touching 30 256-bit words, using 480 bytes */
static void avxpackblockmask30(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 30 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(1073741823);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
w1 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
w1 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
w1 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
w1 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
w1 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
w1 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 6));
w1 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
w0 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 2));
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
w1 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
w1 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
w1 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
w1 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
w1 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
w1 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 26, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 27, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
w1 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 28, w0);
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 2));
_mm256_storeu_si256(compressed + 29, w1);
}
/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 bytes */
static void avxpackblockmask31(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 31 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
const __m256i mask = _mm256_set1_epi32(2147483647);
__m256i tmp; /* used to store inputs at word boundary */
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
w1 = _mm256_srli_epi32(tmp,1);
_mm256_storeu_si256(compressed + 0, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
w0 = _mm256_srli_epi32(tmp,2);
_mm256_storeu_si256(compressed + 1, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
w1 = _mm256_srli_epi32(tmp,3);
_mm256_storeu_si256(compressed + 2, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
w0 = _mm256_srli_epi32(tmp,4);
_mm256_storeu_si256(compressed + 3, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
w1 = _mm256_srli_epi32(tmp,5);
_mm256_storeu_si256(compressed + 4, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
w0 = _mm256_srli_epi32(tmp,6);
_mm256_storeu_si256(compressed + 5, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
w1 = _mm256_srli_epi32(tmp,7);
_mm256_storeu_si256(compressed + 6, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
w0 = _mm256_srli_epi32(tmp,8);
_mm256_storeu_si256(compressed + 7, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
w1 = _mm256_srli_epi32(tmp,9);
_mm256_storeu_si256(compressed + 8, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
w0 = _mm256_srli_epi32(tmp,10);
_mm256_storeu_si256(compressed + 9, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
w1 = _mm256_srli_epi32(tmp,11);
_mm256_storeu_si256(compressed + 10, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
w0 = _mm256_srli_epi32(tmp,12);
_mm256_storeu_si256(compressed + 11, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
w1 = _mm256_srli_epi32(tmp,13);
_mm256_storeu_si256(compressed + 12, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
w0 = _mm256_srli_epi32(tmp,14);
_mm256_storeu_si256(compressed + 13, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
w1 = _mm256_srli_epi32(tmp,15);
_mm256_storeu_si256(compressed + 14, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
w0 = _mm256_srli_epi32(tmp,16);
_mm256_storeu_si256(compressed + 15, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
w1 = _mm256_srli_epi32(tmp,17);
_mm256_storeu_si256(compressed + 16, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
w0 = _mm256_srli_epi32(tmp,18);
_mm256_storeu_si256(compressed + 17, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
w1 = _mm256_srli_epi32(tmp,19);
_mm256_storeu_si256(compressed + 18, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
w0 = _mm256_srli_epi32(tmp,20);
_mm256_storeu_si256(compressed + 19, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
w1 = _mm256_srli_epi32(tmp,21);
_mm256_storeu_si256(compressed + 20, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
w0 = _mm256_srli_epi32(tmp,22);
_mm256_storeu_si256(compressed + 21, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
w1 = _mm256_srli_epi32(tmp,23);
_mm256_storeu_si256(compressed + 22, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
w0 = _mm256_srli_epi32(tmp,24);
_mm256_storeu_si256(compressed + 23, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
w1 = _mm256_srli_epi32(tmp,25);
_mm256_storeu_si256(compressed + 24, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
w0 = _mm256_srli_epi32(tmp,26);
_mm256_storeu_si256(compressed + 25, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
w1 = _mm256_srli_epi32(tmp,27);
_mm256_storeu_si256(compressed + 26, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
w0 = _mm256_srli_epi32(tmp,28);
_mm256_storeu_si256(compressed + 27, w1);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 3));
w1 = _mm256_srli_epi32(tmp,29);
_mm256_storeu_si256(compressed + 28, w0);
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 2));
w0 = _mm256_srli_epi32(tmp,30);
_mm256_storeu_si256(compressed + 29, w1);
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 1));
_mm256_storeu_si256(compressed + 30, w0);
}
/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 bytes */
static void avxpackblockmask32(const uint32_t * pin, __m256i * compressed) {
/* we are going to touch 32 256-bit words */
__m256i w0, w1;
const __m256i * in = (const __m256i *) pin;
w0 = _mm256_lddqu_si256 (in + 0) ;
_mm256_storeu_si256(compressed + 0, w0);
w1 = _mm256_lddqu_si256 (in + 1) ;
_mm256_storeu_si256(compressed + 1, w1);
w0 = _mm256_lddqu_si256 (in + 2) ;
_mm256_storeu_si256(compressed + 2, w0);
w1 = _mm256_lddqu_si256 (in + 3) ;
_mm256_storeu_si256(compressed + 3, w1);
w0 = _mm256_lddqu_si256 (in + 4) ;
_mm256_storeu_si256(compressed + 4, w0);
w1 = _mm256_lddqu_si256 (in + 5) ;
_mm256_storeu_si256(compressed + 5, w1);
w0 = _mm256_lddqu_si256 (in + 6) ;
_mm256_storeu_si256(compressed + 6, w0);
w1 = _mm256_lddqu_si256 (in + 7) ;
_mm256_storeu_si256(compressed + 7, w1);
w0 = _mm256_lddqu_si256 (in + 8) ;
_mm256_storeu_si256(compressed + 8, w0);
w1 = _mm256_lddqu_si256 (in + 9) ;
_mm256_storeu_si256(compressed + 9, w1);
w0 = _mm256_lddqu_si256 (in + 10) ;
_mm256_storeu_si256(compressed + 10, w0);
w1 = _mm256_lddqu_si256 (in + 11) ;
_mm256_storeu_si256(compressed + 11, w1);
w0 = _mm256_lddqu_si256 (in + 12) ;
_mm256_storeu_si256(compressed + 12, w0);
w1 = _mm256_lddqu_si256 (in + 13) ;
_mm256_storeu_si256(compressed + 13, w1);
w0 = _mm256_lddqu_si256 (in + 14) ;
_mm256_storeu_si256(compressed + 14, w0);
w1 = _mm256_lddqu_si256 (in + 15) ;
_mm256_storeu_si256(compressed + 15, w1);
w0 = _mm256_lddqu_si256 (in + 16) ;
_mm256_storeu_si256(compressed + 16, w0);
w1 = _mm256_lddqu_si256 (in + 17) ;
_mm256_storeu_si256(compressed + 17, w1);
w0 = _mm256_lddqu_si256 (in + 18) ;
_mm256_storeu_si256(compressed + 18, w0);
w1 = _mm256_lddqu_si256 (in + 19) ;
_mm256_storeu_si256(compressed + 19, w1);
w0 = _mm256_lddqu_si256 (in + 20) ;
_mm256_storeu_si256(compressed + 20, w0);
w1 = _mm256_lddqu_si256 (in + 21) ;
_mm256_storeu_si256(compressed + 21, w1);
w0 = _mm256_lddqu_si256 (in + 22) ;
_mm256_storeu_si256(compressed + 22, w0);
w1 = _mm256_lddqu_si256 (in + 23) ;
_mm256_storeu_si256(compressed + 23, w1);
w0 = _mm256_lddqu_si256 (in + 24) ;
_mm256_storeu_si256(compressed + 24, w0);
w1 = _mm256_lddqu_si256 (in + 25) ;
_mm256_storeu_si256(compressed + 25, w1);
w0 = _mm256_lddqu_si256 (in + 26) ;
_mm256_storeu_si256(compressed + 26, w0);
w1 = _mm256_lddqu_si256 (in + 27) ;
_mm256_storeu_si256(compressed + 27, w1);
w0 = _mm256_lddqu_si256 (in + 28) ;
_mm256_storeu_si256(compressed + 28, w0);
w1 = _mm256_lddqu_si256 (in + 29) ;
_mm256_storeu_si256(compressed + 29, w1);
w0 = _mm256_lddqu_si256 (in + 30) ;
_mm256_storeu_si256(compressed + 30, w0);
w1 = _mm256_lddqu_si256 (in + 31) ;
_mm256_storeu_si256(compressed + 31, w1);
}
static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {
(void) compressed;
memset(pout,0,256);
}
/* we packed 256 1-bit values, touching 1 256-bit words, using 16 bytes */
static void avxunpackblock1(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 1 256-bit word */
__m256i w0;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(1);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 19) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 23) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 25) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 27) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 28) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 29) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 30) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 31) );
}
/* we packed 256 2-bit values, touching 2 256-bit words, using 32 bytes */
static void avxunpackblock2(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 2 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(3);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 28) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 30) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 26) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 28) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 30) );
}
/* we packed 256 3-bit values, touching 3 256-bit words, using 48 bytes */
static void avxunpackblock3(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 3 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(7);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 27) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 25) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 28) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 23) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 29) );
}
/* we packed 256 4-bit values, touching 4 256-bit words, using 64 bytes */
static void avxunpackblock4(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 4 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(15);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 28) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 28) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 28) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 28) );
}
/* we packed 256 5-bit values, touching 5 256-bit words, using 80 bytes */
static void avxunpackblock5(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 5 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(31);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 25) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 23) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 27) );
}
/* we packed 256 6-bit values, touching 6 256-bit words, using 96 bytes */
static void avxunpackblock6(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 6 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(63);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 26) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 26) );
}
/* we packed 256 7-bit values, touching 7 256-bit words, using 112 bytes */
static void avxunpackblock7(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 7 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(127);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 17) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 23) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 19) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 15) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 25) );
}
/* we packed 256 8-bit values, touching 8 256-bit words, using 128 bytes */
static void avxunpackblock8(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 8 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(255);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 3, _mm256_srli_epi32( w0 , 24) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 24) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 11, _mm256_srli_epi32( w0 , 24) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 24) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 19, _mm256_srli_epi32( w0 , 24) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 24) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
_mm256_storeu_si256(out + 27, _mm256_srli_epi32( w0 , 24) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 24) );
}
/* we packed 256 9-bit values, touching 9 256-bit words, using 144 bytes */
static void avxunpackblock9(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 9 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(511);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 21) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 11) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 23) );
}
/* we packed 256 10-bit values, touching 10 256-bit words, using 160 bytes */
static void avxunpackblock10(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 10 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(1023);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 22) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 22) );
}
/* we packed 256 11-bit values, touching 11 256-bit words, using 176 bytes */
static void avxunpackblock11(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 11 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(2047);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 19) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 21) );
}
/* we packed 256 12-bit values, touching 12 256-bit words, using 192 bytes */
static void avxunpackblock12(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 12 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(4095);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 20) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 20) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 20) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 20) );
}
/* we packed 256 13-bit values, touching 13 256-bit words, using 208 bytes */
static void avxunpackblock13(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 13 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(8191);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( w0 , 11 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 11) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 19) );
}
/* we packed 256 14-bit values, touching 14 256-bit words, using 224 bytes */
static void avxunpackblock14(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 14 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(16383);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 18) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 18) );
}
/* we packed 256 15-bit values, touching 15 256-bit words, using 240 bytes */
static void avxunpackblock15(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 15 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(32767);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 17) );
}
/* we packed 256 16-bit values, touching 16 256-bit words, using 256 bytes */
static void avxunpackblock16(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 16 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(65535);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 1, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 3, _mm256_srli_epi32( w1 , 16) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 5, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 16) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 9, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 11, _mm256_srli_epi32( w1 , 16) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 13, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 16) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 17, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 19, _mm256_srli_epi32( w1 , 16) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 21, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 16) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 25, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 27, _mm256_srli_epi32( w1 , 16) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w0 ) );
_mm256_storeu_si256(out + 29, _mm256_srli_epi32( w0 , 16) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, w1 ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 16) );
}
/* we packed 256 17-bit values, touching 17 256-bit words, using 272 bytes */
static void avxunpackblock17(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 17 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(131071);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 15) );
}
/* we packed 256 18-bit values, touching 18 256-bit words, using 288 bytes */
static void avxunpackblock18(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 18 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(262143);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 14) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 14) );
}
/* we packed 256 19-bit values, touching 19 256-bit words, using 304 bytes */
static void avxunpackblock19(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 19 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(524287);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 11) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( w0 , 11 ) ) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 13) );
}
/* we packed 256 20-bit values, touching 20 256-bit words, using 320 bytes */
static void avxunpackblock20(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 20 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(1048575);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 12) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 12) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 12) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 12) );
}
/* we packed 256 21-bit values, touching 21 256-bit words, using 336 bytes */
static void avxunpackblock21(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 21 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(2097151);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 11) );
}
/* we packed 256 22-bit values, touching 22 256-bit words, using 352 bytes */
static void avxunpackblock22(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 22 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(4194303);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 10) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 10) );
}
/* we packed 256 23-bit values, touching 23 256-bit words, using 368 bytes */
static void avxunpackblock23(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 23 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(8388607);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 11) ,_mm256_slli_epi32( w0 , 21 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( w0 , 11 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 9) );
}
/* we packed 256 24-bit values, touching 24 256-bit words, using 384 bytes */
static void avxunpackblock24(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 24 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(16777215);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 8) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 8) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 8) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 8) );
}
/* we packed 256 25-bit values, touching 25 256-bit words, using 400 bytes */
static void avxunpackblock25(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 25 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(33554431);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 15) ,_mm256_slli_epi32( w0 , 17 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 9) ,_mm256_slli_epi32( w0 , 23 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 17) ,_mm256_slli_epi32( w0 , 15 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 7) );
}
/* we packed 256 26-bit values, touching 26 256-bit words, using 416 bytes */
static void avxunpackblock26(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 26 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(67108863);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 6) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 6) );
}
/* we packed 256 27-bit values, touching 27 256-bit words, using 432 bytes */
static void avxunpackblock27(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 27 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(134217727);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 7) ,_mm256_slli_epi32( w1 , 25 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 9) ,_mm256_slli_epi32( w0 , 23 ) ) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 5) );
}
/* we packed 256 28-bit values, touching 28 256-bit words, using 448 bytes */
static void avxunpackblock28(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 28 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(268435455);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 4) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 4) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 4) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 4) );
}
/* we packed 256 29-bit values, touching 29 256-bit words, using 464 bytes */
static void avxunpackblock29(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 29 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(536870911);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 5) ,_mm256_slli_epi32( w1 , 27 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 7) ,_mm256_slli_epi32( w0 , 25 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 4) ,_mm256_slli_epi32( w1 , 28 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 9) ,_mm256_slli_epi32( w1 , 23 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 3) );
}
/* we packed 256 30-bit values, touching 30 256-bit words, using 480 bytes */
static void avxunpackblock30(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 30 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(1073741823);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 6) ,_mm256_slli_epi32( w1 , 26 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 4) ,_mm256_slli_epi32( w0 , 28 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 2) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 29);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 4) ,_mm256_slli_epi32( w1 , 28 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 2) );
}
/* we packed 256 31-bit values, touching 31 256-bit words, using 496 bytes */
static void avxunpackblock31(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 31 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(2147483647);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 9) ,_mm256_slli_epi32( w1 , 23 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 7) ,_mm256_slli_epi32( w1 , 25 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 5) ,_mm256_slli_epi32( w1 , 27 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 4) ,_mm256_slli_epi32( w0 , 28 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 29);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 3) ,_mm256_slli_epi32( w1 , 29 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 30);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 2) ,_mm256_slli_epi32( w0 , 30 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 1) );
}
/* we packed 256 32-bit values, touching 32 256-bit words, using 512 bytes */
static void avxunpackblock32(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 32 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, w0 );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1, w1 );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2, w0 );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3, w1 );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4, w0 );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5, w1 );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6, w0 );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7, w1 );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8, w0 );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9, w1 );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 10, w0 );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 11, w1 );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 12, w0 );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 13, w1 );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 14, w0 );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 15, w1 );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 16, w0 );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 17, w1 );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 18, w0 );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 19, w1 );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 20, w0 );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 21, w1 );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 22, w0 );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 23, w1 );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 24, w0 );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 25, w1 );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 26, w0 );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 27, w1 );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 28, w0 );
w1 = _mm256_lddqu_si256 (compressed + 29);
_mm256_storeu_si256(out + 29, w1 );
w0 = _mm256_lddqu_si256 (compressed + 30);
_mm256_storeu_si256(out + 30, w0 );
w1 = _mm256_lddqu_si256 (compressed + 31);
_mm256_storeu_si256(out + 31, w1 );
}
static avxpackblockfnc avxfuncPackArr[] = {
&avxpackblock0,
&avxpackblock1,
&avxpackblock2,
&avxpackblock3,
&avxpackblock4,
&avxpackblock5,
&avxpackblock6,
&avxpackblock7,
&avxpackblock8,
&avxpackblock9,
&avxpackblock10,
&avxpackblock11,
&avxpackblock12,
&avxpackblock13,
&avxpackblock14,
&avxpackblock15,
&avxpackblock16,
&avxpackblock17,
&avxpackblock18,
&avxpackblock19,
&avxpackblock20,
&avxpackblock21,
&avxpackblock22,
&avxpackblock23,
&avxpackblock24,
&avxpackblock25,
&avxpackblock26,
&avxpackblock27,
&avxpackblock28,
&avxpackblock29,
&avxpackblock30,
&avxpackblock31,
&avxpackblock32
};
static avxpackblockfnc avxfuncPackMaskArr[] = {
&avxpackblockmask0,
&avxpackblockmask1,
&avxpackblockmask2,
&avxpackblockmask3,
&avxpackblockmask4,
&avxpackblockmask5,
&avxpackblockmask6,
&avxpackblockmask7,
&avxpackblockmask8,
&avxpackblockmask9,
&avxpackblockmask10,
&avxpackblockmask11,
&avxpackblockmask12,
&avxpackblockmask13,
&avxpackblockmask14,
&avxpackblockmask15,
&avxpackblockmask16,
&avxpackblockmask17,
&avxpackblockmask18,
&avxpackblockmask19,
&avxpackblockmask20,
&avxpackblockmask21,
&avxpackblockmask22,
&avxpackblockmask23,
&avxpackblockmask24,
&avxpackblockmask25,
&avxpackblockmask26,
&avxpackblockmask27,
&avxpackblockmask28,
&avxpackblockmask29,
&avxpackblockmask30,
&avxpackblockmask31,
&avxpackblockmask32
};
static avxunpackblockfnc avxfuncUnpackArr[] = {
&avxunpackblock0,
&avxunpackblock1,
&avxunpackblock2,
&avxunpackblock3,
&avxunpackblock4,
&avxunpackblock5,
&avxunpackblock6,
&avxunpackblock7,
&avxunpackblock8,
&avxunpackblock9,
&avxunpackblock10,
&avxunpackblock11,
&avxunpackblock12,
&avxunpackblock13,
&avxunpackblock14,
&avxunpackblock15,
&avxunpackblock16,
&avxunpackblock17,
&avxunpackblock18,
&avxunpackblock19,
&avxunpackblock20,
&avxunpackblock21,
&avxunpackblock22,
&avxunpackblock23,
&avxunpackblock24,
&avxunpackblock25,
&avxunpackblock26,
&avxunpackblock27,
&avxunpackblock28,
&avxunpackblock29,
&avxunpackblock30,
&avxunpackblock31,
&avxunpackblock32
};
/** code generated by avxpacking.py ends here **/
/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpack(const uint32_t * in,__m256i * out, const uint32_t bit) {
avxfuncPackMaskArr[bit](in,out);
}
/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpackwithoutmask(const uint32_t * in,__m256i * out, const uint32_t bit) {
avxfuncPackArr[bit](in,out);
}
/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */
void avxunpack(const __m256i * in,uint32_t * out, const uint32_t bit) {
avxfuncUnpackArr[bit](in,out);
}
#endif /* __AVX2__ */