#include "avxbitpacking.h"

#ifdef __AVX2__

/* OR-reduce the eight 32-bit lanes of the accumulator and return the number of
   bits needed to represent the result. */
static uint32_t maxbitas32int(const __m256i accumulator) {
  const __m256i _tmp1 = _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator); /* (A,B,C,D) or (0,0,A,B) = (A,B,C or A,D or B) */
  const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1); /* (A,B,C or A,D or B) or (0,0,0,C or A) */
  uint32_t ans1 = _mm256_extract_epi32(_tmp2,0);
  uint32_t ans2 = _mm256_extract_epi32(_tmp2,4);
  uint32_t ans = ans1 > ans2 ? ans1 : ans2; /* _mm256_srli_si256 shifts within each 128-bit lane, so take the max of the two lanes */
  return bits(ans);
}

/* Return the maximum bit width over a block of AVXBlockSize 32-bit integers. */
uint32_t avxmaxbits(const uint32_t * begin) {
  const __m256i* pin = (const __m256i*)(begin);
  __m256i accumulator = _mm256_lddqu_si256(pin);
  uint32_t k = 1;
  for(; 8*k < AVXBlockSize; ++k) { /* each 256-bit load covers 8 integers */
    __m256i newvec = _mm256_lddqu_si256(pin+k);
    accumulator = _mm256_or_si256(accumulator,newvec);
  }
  return maxbitas32int(accumulator);
}

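/*
 * Reference sketch (scalar, not generated code): what avxmaxbits computes,
 * assuming AVXBlockSize is 256 and that bits(x) returns the number of
 * significant bits in x (0 for x == 0). The name scalarmaxbits is only
 * illustrative.
 *
 *   static uint32_t scalarmaxbits(const uint32_t *begin) {
 *     uint32_t accumulator = 0;
 *     for (uint32_t k = 0; k < 256; ++k)
 *       accumulator |= begin[k];   // keep every bit that is set anywhere in the block
 *     return bits(accumulator);    // width of the widest value
 *   }
 */
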
/** code generated by avxpacking.py starts here **/

typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);
typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);

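/*
 * Usage sketch: these function-pointer types are intended for per-bit-width
 * dispatch tables. A minimal example; the names avxpackers and avxpack are
 * assumptions, and a full table would list every width up to 32:
 *
 *   static const avxpackblockfnc avxpackers[] = {
 *     avxpackblock0, avxpackblock1, avxpackblock2, avxpackblock3
 *   };
 *
 *   static void avxpack(const uint32_t *in, __m256i *out, const uint32_t bit) {
 *     avxpackers[bit](in, out);   // pack one block of 256 integers at 'bit' bits each
 *   }
 */
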
static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {
  (void)compressed;
  (void) pin; /* we consumed 256 32-bit integers */
}

/* we are going to pack 256 1-bit values, touching 1 256-bit word, using 32 bytes */
static void avxpackblock1(const uint32_t * pin, __m256i * compressed) {
  const __m256i * in = (const __m256i *) pin;
  /* we are going to touch 1 256-bit word */
  __m256i w0;
  w0 = _mm256_lddqu_si256 (in + 0);
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 1));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 2));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 3));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 5));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 6));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 7));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 9));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 10));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 11));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 13));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 14));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 15));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 17));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 18));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 19));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 20));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 21));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 22));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 23));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 24));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 25));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 26));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 27));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 28));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 29));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 30));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 31));
  _mm256_storeu_si256(compressed + 0, w0);
}

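/*
 * Layout sketch for the 1-bit case above: the packing is "vertical". Load j
 * (j = 0..31) covers inputs in[8*j] .. in[8*j + 7], and lane L of the single
 * output word collects bit j from in[8*j + L]. A scalar equivalent (the name
 * scalarpackblock1 is only illustrative; like the SIMD code it assumes each
 * input value already fits in 1 bit):
 *
 *   static void scalarpackblock1(const uint32_t *in, uint32_t out[8]) {
 *     for (int lane = 0; lane < 8; ++lane) {
 *       uint32_t w = 0;
 *       for (int j = 0; j < 32; ++j)
 *         w |= in[8 * j + lane] << j;   // bit j of this lane
 *       out[lane] = w;
 *     }
 *   }
 */
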
/* we are going to pack 256 2-bit values, touching 2 256-bit words, using 64 bytes */
static void avxpackblock2(const uint32_t * pin, __m256i * compressed) {
  const __m256i * in = (const __m256i *) pin;
  /* we are going to touch 2 256-bit words */
  __m256i w0, w1;
  w0 = _mm256_lddqu_si256 (in + 0);
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 2));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 4));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 6));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 10));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 12));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 14));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 18));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 20));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 22));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 24));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 26));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 28));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 30));
  _mm256_storeu_si256(compressed + 0, w0);
  w1 = _mm256_lddqu_si256 (in + 16);
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 2));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 4));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 6));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 10));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 12));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 14));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 18));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 20));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 22));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 24));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 26));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 28));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 30));
  _mm256_storeu_si256(compressed + 1, w1);
}

/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 96 bytes */
static void avxpackblock3(const uint32_t * pin, __m256i * compressed) {
  const __m256i * in = (const __m256i *) pin;
  /* we are going to touch 3 256-bit words */
  __m256i w0, w1;
  __m256i tmp; /* used to store inputs at word boundary */
  w0 = _mm256_lddqu_si256 (in + 0);
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 3));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 6));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 9));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 15));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 18));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 21));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 24));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 27));
  tmp = _mm256_lddqu_si256 (in + 10);
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
  w1 = _mm256_srli_epi32(tmp,2);
  _mm256_storeu_si256(compressed + 0, w0);
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 1));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 7));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 10));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 13));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 19));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 22));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 25));
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 28));
  tmp = _mm256_lddqu_si256 (in + 21);
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
  w0 = _mm256_srli_epi32(tmp,1);
  _mm256_storeu_si256(compressed + 1, w1);
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 2));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 5));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 11));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 14));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 17));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 20));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 23));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 26));
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 29));
  _mm256_storeu_si256(compressed + 2, w0);
}

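/*
 * Boundary handling used by avxpackblock3 above (and by every width that does
 * not divide 32): when a b-bit value starts at bit offset shift with
 * shift > 32 - b, its low 32 - shift bits finish the current output word and
 * the remaining high bits are carried into the next word with a right shift.
 * Scalar sketch of that split for one lane (carry_across is a hypothetical
 * helper, not part of this file):
 *
 *   static void carry_across(uint32_t v, unsigned shift, uint32_t *cur, uint32_t *next) {
 *     *cur |= v << shift;          // low 32 - shift bits finish the current word
 *     *next = v >> (32 - shift);   // high bits seed the next word
 *   }
 *
 * In avxpackblock3, the load from in + 10 is handled exactly this way with
 * shift = 30, producing the srli by 2 into w1.
 */
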
/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 bytes */
|
|
static void avxpackblock4(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 4 256-bit words */
|
|
__m256i w0, w1;
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 24));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 28));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 20));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 24));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 28));
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 24));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 28));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 20));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 24));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 28));
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 bytes */
|
|
static void avxpackblock5(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 5 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 15));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 25));
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 13));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 18));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 23));
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 1));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 11));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 21));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 26));
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 9));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 14));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 19));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 24));
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 7));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 17));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 22));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 27));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 bytes */
|
|
static void avxpackblock6(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 6 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 18));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 24));
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 22));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 26));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 16);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 6));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 18));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 24));
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 22));
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 14));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 20));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 26));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 bytes */
|
|
static void avxpackblock7(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 7 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 7));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 21));
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 17));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 24));
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 13));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 20));
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 9));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 23));
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 19));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 15));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 22));
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 11));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 18));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 25));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 bytes */
|
|
static void avxpackblock8(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 8 256-bit words */
|
|
__m256i w0, w1;
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 24));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 24));
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 24));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 24));
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 24));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 20);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 24));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 24));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 24));
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
}
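/* Note: for widths that divide 32 evenly (1, 2, 4, 8, 16, 32) no value ever
   straddles a 32-bit lane boundary, which is why avxpackblock8 above needs no
   tmp word or carry shifts. */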
|
|
|
|
|
|
/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 bytes */
|
|
static void avxpackblock9(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 9 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 9));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 18));
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 13));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 22));
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 17));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 21));
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 7));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 11));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 20));
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 15));
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 19));
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 23));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 bytes */
|
|
static void avxpackblock10(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 10 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 20));
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 18));
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 22));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 16);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 20));
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 18));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 6));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 22));
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 bytes */
|
|
static void avxpackblock11(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 11 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 11));
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 13));
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 15));
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 5));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 17));
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 7));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 18));
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 19));
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 9));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 20));
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 21));
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 12-bit values, touching 12 256-bit words, using 192 bytes */
|
|
static void avxpackblock12(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 12 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 20));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 20));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 20));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 20));
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 bytes */
|
|
static void avxpackblock13(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 13 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 13));
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 7));
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 1));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
|
|
w0 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 15));
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 9));
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 3));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 17));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 11));
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 18));
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 19));
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 bytes */
|
|
static void avxpackblock14(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 14 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 18));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 16);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 18));
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 bytes */
|
|
static void avxpackblock15(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 15 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 15));
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 13));
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 11));
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 9));
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 7));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 5));
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 3));
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 16) , 16));
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 30) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 17));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 bytes */
|
|
static void avxpackblock16(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 16 256-bit words */
|
|
__m256i w0, w1;
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 1) , 16));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 16));
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 4);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 16));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 16));
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 16));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 16));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 12);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 16));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 16));
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 16));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 16));
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 16));
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 16));
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 16));
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 16));
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 16));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 16));
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 bytes */
|
|
static void avxpackblock17(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 17 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 14));
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 1));
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 3));
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 5));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 7));
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 9));
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 11));
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 13));
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 15));
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 bytes */
|
|
static void avxpackblock18(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 18 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 14));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 16);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 14));
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 bytes */
|
|
static void avxpackblock19(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 19 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 12));
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 5));
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 11));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 3));
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 9));
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
|
|
w0 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 1));
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 7));
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 13));
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 bytes */
|
|
static void avxpackblock20(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 20 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 12));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 8);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 12));
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 12));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 24);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 12));
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 bytes */
|
|
static void avxpackblock21(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 21 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 2) , 10));
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 9));
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 8) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 7));
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 5));
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 20) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 3));
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
|
|
w1 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 29) , 1));
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 11));
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 bytes */
|
|
static void avxpackblock22(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 22 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 10));
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 16);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 10));
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 bytes */
|
|
static void avxpackblock23(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 23 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
|
|
w1 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 5));
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 1));
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 11));
|
|
w0 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 14) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 17) , 7));
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
|
|
w0 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 3));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 24) , 8));
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
|
|
w0 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 28) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 9));
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 bytes */
|
|
static void avxpackblock24(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 24 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 3) , 8));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 4);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 8));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 8);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 11) , 8));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 12);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 8));
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 8));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 20);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 8));
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 24);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 8));
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 28);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 8));
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 bytes */
|
|
static void avxpackblock25(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 25 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
|
|
w1 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 4) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 15));
|
|
w0 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 9) , 1));
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 13) , 5));
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
|
|
w0 = _mm256_srli_epi32(tmp,23);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 18) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
|
|
w1 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 22) , 6));
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 17));
|
|
w0 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 27) , 3));
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 7));
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 bytes */
|
|
static void avxpackblock26(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 26 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 5) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 6));
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 16);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 26) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 6));
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 bytes */
|
|
static void avxpackblock27(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 27 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
|
|
w1 = _mm256_srli_epi32(tmp,25);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 6) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
|
|
w0 = _mm256_srli_epi32(tmp,23);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 12) , 4));
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
|
|
w1 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
|
|
w0 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 19) , 1));
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
|
|
w0 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 25) , 3));
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 5));
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 bytes */
|
|
static void avxpackblock28(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 28 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 7) , 4));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 8);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 4));
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 23) , 4));
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 24);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 4));
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 bytes */
|
|
static void avxpackblock29(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 29 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
|
|
w1 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
|
|
w1 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
|
|
w1 = _mm256_srli_epi32(tmp,27);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 10) , 2));
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
|
|
w0 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 7));
|
|
w0 = _mm256_srli_epi32(tmp,25);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
|
|
w1 = _mm256_srli_epi32(tmp,28);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 21) , 1));
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
|
|
w1 = _mm256_srli_epi32(tmp,23);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
|
|
w0 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 3));
|
|
_mm256_storeu_si256(compressed + 28, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 30-bit values, touching 30 256-bit words, using 480 bytes */
|
|
static void avxpackblock30(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 30 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 6));
|
|
w1 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
|
|
w0 = _mm256_srli_epi32(tmp,28);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 15) , 2));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 16);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
|
|
w0 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
|
|
w1 = _mm256_srli_epi32(tmp,28);
|
|
_mm256_storeu_si256(compressed + 28, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 2));
|
|
_mm256_storeu_si256(compressed + 29, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 bytes */
|
|
static void avxpackblock31(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 31 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
tmp = _mm256_lddqu_si256 (in + 1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 2);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 3);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 4);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 5);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 6);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 7);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 8);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 9);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
|
|
w1 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 10);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 11);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 12);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 13);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 14);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 15);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 16);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 17);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 18);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 19);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
|
|
w1 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 20);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 21);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
|
|
w1 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 22);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 23);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
|
|
w1 = _mm256_srli_epi32(tmp,23);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 24);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 25);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
|
|
w1 = _mm256_srli_epi32(tmp,25);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 26);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
|
|
w0 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 27);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
|
|
w1 = _mm256_srli_epi32(tmp,27);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 28);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
|
|
w0 = _mm256_srli_epi32(tmp,28);
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
tmp = _mm256_lddqu_si256 (in + 29);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 3));
|
|
w1 = _mm256_srli_epi32(tmp,29);
|
|
_mm256_storeu_si256(compressed + 28, w0);
|
|
tmp = _mm256_lddqu_si256 (in + 30);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 2));
|
|
w0 = _mm256_srli_epi32(tmp,30);
|
|
_mm256_storeu_si256(compressed + 29, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(_mm256_lddqu_si256 (in + 31) , 1));
|
|
_mm256_storeu_si256(compressed + 30, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 bytes */
|
|
static void avxpackblock32(const uint32_t * pin, __m256i * compressed) {
|
|
const __m256i * in = (const __m256i *) pin;
|
|
/* we are going to touch 32 256-bit words */
|
|
__m256i w0, w1;
|
|
w0 = _mm256_lddqu_si256 (in + 0);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 2);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 3);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 5);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 6);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 7);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 9);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 10);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 11);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 12);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 13);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 14);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 15);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 16);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 17);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 18);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 19);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 20);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 21);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 22);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 23);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 24);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 25);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 26);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 27);
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 28);
|
|
_mm256_storeu_si256(compressed + 28, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 29);
|
|
_mm256_storeu_si256(compressed + 29, w1);
|
|
w0 = _mm256_lddqu_si256 (in + 30);
|
|
_mm256_storeu_si256(compressed + 30, w0);
|
|
w1 = _mm256_lddqu_si256 (in + 31);
|
|
_mm256_storeu_si256(compressed + 31, w1);
|
|
}
|
|
|
|
|
|
static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {
|
|
(void)compressed;
|
|
(void) pin; /* we consumed 256 32-bit integers */
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 1-bit values, touching 1 256-bit words, using 16 bytes */
|
|
static void avxpackblockmask1(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 1 256-bit word */
|
|
__m256i w0;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 1));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 3));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 7));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 9));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 11));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 13));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 15));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 17));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 18));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 19));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 21));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 22));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 23));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 24));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 25));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 26));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 27));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 28));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 29));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 30));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 31));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 2-bit values, touching 2 256-bit words, using 32 bytes */
|
|
static void avxpackblockmask2(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 2 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(3);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 18));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 22));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 24));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 26));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 28));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 30));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 6));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 14));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 18));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 20));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 22));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 24));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 26));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 28));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 30));
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 3-bit values, touching 3 256-bit words, using 48 bytes */
|
|
static void avxpackblockmask3(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 3 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(7);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 3));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 9));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 15));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 18));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 21));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 24));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 27));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 7));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 13));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 19));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 22));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 25));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 28));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 11));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 17));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 23));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 26));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 29));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 4-bit values, touching 4 256-bit words, using 64 bytes */
|
|
static void avxpackblockmask4(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 4 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(15);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 24));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 28));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 20));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 24));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 28));
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 24));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 28));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 20));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 24));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 28));
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 5-bit values, touching 5 256-bit words, using 80 bytes */
|
|
static void avxpackblockmask5(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 5 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(31);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 15));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 25));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 13));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 18));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 23));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 1));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 11));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 21));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 26));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 9));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 14));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 19));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 24));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 7));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 17));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 22));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 27));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 6-bit values, touching 6 256-bit words, using 96 bytes */
|
|
static void avxpackblockmask6(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 6 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(63);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 18));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 24));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 22));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 20));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 26));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 6));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 18));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 24));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 22));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 14));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 20));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 26));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 7-bit values, touching 7 256-bit words, using 112 bytes */
|
|
static void avxpackblockmask7(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 7 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(127);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 7));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 21));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 17));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 24));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 13));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 20));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 9));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 23));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 19));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 15));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 22));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 11));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 18));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 25));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 8-bit values, touching 8 256-bit words, using 128 bytes */
|
|
static void avxpackblockmask8(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 8 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(255);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 24));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 24));
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 24));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 24));
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 24));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 24));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 16));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 24));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 16));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 24));
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 9-bit values, touching 9 256-bit words, using 144 bytes */
|
|
static void avxpackblockmask9(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 9 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(511);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 9));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 18));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 13));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 22));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 17));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 21));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 7));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 11));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 20));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 15));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 19));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 14));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 23));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 10-bit values, touching 10 256-bit words, using 160 bytes */
|
|
static void avxpackblockmask10(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 10 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(1023);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 20));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 18));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 12));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 22));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 10));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 20));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 18));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 6));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 12));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 22));
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 11-bit values, touching 11 256-bit words, using 176 bytes */
|
|
static void avxpackblockmask11(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 11 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(2047);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 11));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 13));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 3));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 15));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 5));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 17));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 7));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 18));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 19));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 9));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 20));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 10));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 21));
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 12-bit values, touching 12 256-bit words, using 192 bytes */
|
|
static void avxpackblockmask12(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 12 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(4095);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 20));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 20));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 8));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 20));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 8));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 20));
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 13-bit values, touching 13 256-bit words, using 208 bytes */
|
|
static void avxpackblockmask13(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 13 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(8191);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 13));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 7));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 1));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
|
|
w0 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 15));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 9));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 3));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 17));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 11));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 5));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 18));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 6));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 19));
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 14-bit values, touching 14 256-bit words, using 224 bytes */
|
|
static void avxpackblockmask14(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 14 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(16383);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 2));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 4));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 18));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 4));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 18));
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 15-bit values, touching 15 256-bit words, using 240 bytes */
|
|
static void avxpackblockmask15(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 15 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(32767);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 15));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 13));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 11));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 9));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 7));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 5));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 3));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 1));
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) , 16));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) , 2));
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 17));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 16-bit values, touching 16 256-bit words, using 256 bytes */
|
|
static void avxpackblockmask16(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 16 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(65535);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) , 16));
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 16));
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 16));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 16));
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 16));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 16));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 16));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 16));
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 16));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 16));
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 16));
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 16));
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 16));
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 16));
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 16));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 16));
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 17-bit values, touching 17 256-bit words, using 272 bytes */
|
|
static void avxpackblockmask17(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 17 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(131071);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 14));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 1));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 3));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 5));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 7));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 9));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 11));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 13));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 15));
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 18-bit values, touching 18 256-bit words, using 288 bytes */
|
|
static void avxpackblockmask18(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 18 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(262143);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 14));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 14));
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 19-bit values, touching 19 256-bit words, using 304 bytes */
|
|
static void avxpackblockmask19(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 19 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(524287);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 12));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 5));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 11));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 3));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 9));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
|
|
w0 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 1));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 7));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 13));
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 20-bit values, touching 20 256-bit words, using 320 bytes */
|
|
static void avxpackblockmask20(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 20 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(1048575);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 12));
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 12));
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 12));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 12));
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 21-bit values, touching 21 256-bit words, using 336 bytes */
|
|
static void avxpackblockmask21(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 21 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(2097151);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) , 10));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 9));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 7));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 27));
|
|
w0 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 5));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
|
|
w0 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 3));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
|
|
w1 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) , 1));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 11));
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 22-bit values, touching 22 256-bit words, using 352 bytes */
|
|
static void avxpackblockmask22(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 22 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(4194303);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 10));
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 10));
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 23-bit values, touching 23 256-bit words, using 368 bytes */
|
|
static void avxpackblockmask23(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 23 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(8388607);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
|
|
w1 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 5));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 1));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 11));
|
|
w0 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) , 7));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 21));
|
|
w0 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 3));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) , 8));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
|
|
w0 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 9));
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 24-bit values, touching 24 256-bit words, using 384 bytes */
|
|
static void avxpackblockmask24(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 24 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(16777215);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) , 8));
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 8));
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) , 8));
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 8));
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 8));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 8));
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 8));
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 8));
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 25-bit values, touching 25 256-bit words, using 400 bytes */
|
|
static void avxpackblockmask25(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 25 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(33554431);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
|
|
w1 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 15));
|
|
w0 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) , 1));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) , 5));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
|
|
w0 = _mm256_srli_epi32(tmp,23);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
|
|
w1 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) , 6));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
|
|
w0 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 17));
|
|
w0 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) , 3));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 7));
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 26-bit values, touching 26 256-bit words, using 416 bytes */
|
|
static void avxpackblockmask26(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 26 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(67108863);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 6));
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 6));
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 27-bit values, touching 27 256-bit words, using 432 bytes */
|
|
static void avxpackblockmask27(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 27 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(134217727);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
|
|
w1 = _mm256_srli_epi32(tmp,25);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) , 2));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 29));
|
|
w0 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
|
|
w0 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 9));
|
|
w0 = _mm256_srli_epi32(tmp,23);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) , 4));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
|
|
w1 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
|
|
w0 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) , 1));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 23));
|
|
w0 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
|
|
w0 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) , 3));
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 5));
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 28-bit values, touching 28 256-bit words, using 448 bytes */
|
|
static void avxpackblockmask28(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 28 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(268435455);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) , 4));
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 4));
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) , 4));
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 4));
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 29-bit values, touching 29 256-bit words, using 464 bytes */
|
|
static void avxpackblockmask29(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 29 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(536870911);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
|
|
w1 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
  w0 = _mm256_srli_epi32(tmp,18);
  _mm256_storeu_si256(compressed + 5, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
  w1 = _mm256_srli_epi32(tmp,21);
  _mm256_storeu_si256(compressed + 6, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
  w0 = _mm256_srli_epi32(tmp,24);
  _mm256_storeu_si256(compressed + 7, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
  w1 = _mm256_srli_epi32(tmp,27);
  _mm256_storeu_si256(compressed + 8, w0);
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) , 2));
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 31));
  w0 = _mm256_srli_epi32(tmp,1);
  _mm256_storeu_si256(compressed + 9, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
  w1 = _mm256_srli_epi32(tmp,4);
  _mm256_storeu_si256(compressed + 10, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 25));
  w0 = _mm256_srli_epi32(tmp,7);
  _mm256_storeu_si256(compressed + 11, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
  w1 = _mm256_srli_epi32(tmp,10);
  _mm256_storeu_si256(compressed + 12, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 19));
  w0 = _mm256_srli_epi32(tmp,13);
  _mm256_storeu_si256(compressed + 13, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
  w1 = _mm256_srli_epi32(tmp,16);
  _mm256_storeu_si256(compressed + 14, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 13));
  w0 = _mm256_srli_epi32(tmp,19);
  _mm256_storeu_si256(compressed + 15, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
  w1 = _mm256_srli_epi32(tmp,22);
  _mm256_storeu_si256(compressed + 16, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 7));
  w0 = _mm256_srli_epi32(tmp,25);
  _mm256_storeu_si256(compressed + 17, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
  w1 = _mm256_srli_epi32(tmp,28);
  _mm256_storeu_si256(compressed + 18, w0);
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) , 1));
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
  w0 = _mm256_srli_epi32(tmp,2);
  _mm256_storeu_si256(compressed + 19, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
  w1 = _mm256_srli_epi32(tmp,5);
  _mm256_storeu_si256(compressed + 20, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
  w0 = _mm256_srli_epi32(tmp,8);
  _mm256_storeu_si256(compressed + 21, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
  w1 = _mm256_srli_epi32(tmp,11);
  _mm256_storeu_si256(compressed + 22, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
  w0 = _mm256_srli_epi32(tmp,14);
  _mm256_storeu_si256(compressed + 23, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
  w1 = _mm256_srli_epi32(tmp,17);
  _mm256_storeu_si256(compressed + 24, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
  w0 = _mm256_srli_epi32(tmp,20);
  _mm256_storeu_si256(compressed + 25, w1);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
  w1 = _mm256_srli_epi32(tmp,23);
  _mm256_storeu_si256(compressed + 26, w0);
  tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
  w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
  w0 = _mm256_srli_epi32(tmp,26);
  _mm256_storeu_si256(compressed + 27, w1);
  w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 3));
  _mm256_storeu_si256(compressed + 28, w0);
}

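/* Note (explanatory, not part of the generated code): the shift amounts above are
   consistent with a 29-bit packer, i.e. value i starts at bit offset (29 * i) mod 32.
   The routine alternates between two accumulators, w0 and w1: for a width b > 16
   almost every value straddles a 32-bit lane boundary, so its low bits are OR'ed
   into the current word with a left shift while the spilled high bits seed the other
   word with a right shift. Worked example for b = 29: value 1 starts at bit 29, so
   3 bits go into w0 via slli(tmp, 29) and the remaining 26 bits land in w1 via
   srli(tmp, 3). */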
/* we are going to pack 256 30-bit values, touching 30 256-bit words, using 480 bytes */
|
|
static void avxpackblockmask30(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 30 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(1073741823);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 30));
|
|
w1 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 26));
|
|
w1 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 22));
|
|
w1 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 18));
|
|
w1 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 14));
|
|
w1 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 10));
|
|
w1 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 6));
|
|
w1 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
|
|
w0 = _mm256_srli_epi32(tmp,28);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) , 2));
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
w1 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 28));
|
|
w1 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 24));
|
|
w1 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 20));
|
|
w1 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 16));
|
|
w1 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 12));
|
|
w1 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 8));
|
|
w1 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
|
|
w0 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 4));
|
|
w1 = _mm256_srli_epi32(tmp,28);
|
|
_mm256_storeu_si256(compressed + 28, w0);
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 2));
|
|
_mm256_storeu_si256(compressed + 29, w1);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 31-bit values, touching 31 256-bit words, using 496 bytes */
|
|
static void avxpackblockmask31(const uint32_t * pin, __m256i * compressed) {
|
|
/* we are going to touch 31 256-bit words */
|
|
__m256i w0, w1;
|
|
const __m256i * in = (const __m256i *) pin;
|
|
const __m256i mask = _mm256_set1_epi32(2147483647);
|
|
__m256i tmp; /* used to store inputs at word boundary */
|
|
w0 = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 0) ) ;
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 1) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 31));
|
|
w1 = _mm256_srli_epi32(tmp,1);
|
|
_mm256_storeu_si256(compressed + 0, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 2) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 30));
|
|
w0 = _mm256_srli_epi32(tmp,2);
|
|
_mm256_storeu_si256(compressed + 1, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 3) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 29));
|
|
w1 = _mm256_srli_epi32(tmp,3);
|
|
_mm256_storeu_si256(compressed + 2, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 4) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 28));
|
|
w0 = _mm256_srli_epi32(tmp,4);
|
|
_mm256_storeu_si256(compressed + 3, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 5) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 27));
|
|
w1 = _mm256_srli_epi32(tmp,5);
|
|
_mm256_storeu_si256(compressed + 4, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 6) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 26));
|
|
w0 = _mm256_srli_epi32(tmp,6);
|
|
_mm256_storeu_si256(compressed + 5, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 7) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 25));
|
|
w1 = _mm256_srli_epi32(tmp,7);
|
|
_mm256_storeu_si256(compressed + 6, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 8) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 24));
|
|
w0 = _mm256_srli_epi32(tmp,8);
|
|
_mm256_storeu_si256(compressed + 7, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 9) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 23));
|
|
w1 = _mm256_srli_epi32(tmp,9);
|
|
_mm256_storeu_si256(compressed + 8, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 10) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 22));
|
|
w0 = _mm256_srli_epi32(tmp,10);
|
|
_mm256_storeu_si256(compressed + 9, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 11) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 21));
|
|
w1 = _mm256_srli_epi32(tmp,11);
|
|
_mm256_storeu_si256(compressed + 10, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 12) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 20));
|
|
w0 = _mm256_srli_epi32(tmp,12);
|
|
_mm256_storeu_si256(compressed + 11, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 13) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 19));
|
|
w1 = _mm256_srli_epi32(tmp,13);
|
|
_mm256_storeu_si256(compressed + 12, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 14) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 18));
|
|
w0 = _mm256_srli_epi32(tmp,14);
|
|
_mm256_storeu_si256(compressed + 13, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 15) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 17));
|
|
w1 = _mm256_srli_epi32(tmp,15);
|
|
_mm256_storeu_si256(compressed + 14, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 16) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 16));
|
|
w0 = _mm256_srli_epi32(tmp,16);
|
|
_mm256_storeu_si256(compressed + 15, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 17) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 15));
|
|
w1 = _mm256_srli_epi32(tmp,17);
|
|
_mm256_storeu_si256(compressed + 16, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 18) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 14));
|
|
w0 = _mm256_srli_epi32(tmp,18);
|
|
_mm256_storeu_si256(compressed + 17, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 19) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 13));
|
|
w1 = _mm256_srli_epi32(tmp,19);
|
|
_mm256_storeu_si256(compressed + 18, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 20) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 12));
|
|
w0 = _mm256_srli_epi32(tmp,20);
|
|
_mm256_storeu_si256(compressed + 19, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 21) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 11));
|
|
w1 = _mm256_srli_epi32(tmp,21);
|
|
_mm256_storeu_si256(compressed + 20, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 22) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 10));
|
|
w0 = _mm256_srli_epi32(tmp,22);
|
|
_mm256_storeu_si256(compressed + 21, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 23) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 9));
|
|
w1 = _mm256_srli_epi32(tmp,23);
|
|
_mm256_storeu_si256(compressed + 22, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 24) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 8));
|
|
w0 = _mm256_srli_epi32(tmp,24);
|
|
_mm256_storeu_si256(compressed + 23, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 25) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 7));
|
|
w1 = _mm256_srli_epi32(tmp,25);
|
|
_mm256_storeu_si256(compressed + 24, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 26) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 6));
|
|
w0 = _mm256_srli_epi32(tmp,26);
|
|
_mm256_storeu_si256(compressed + 25, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 27) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 5));
|
|
w1 = _mm256_srli_epi32(tmp,27);
|
|
_mm256_storeu_si256(compressed + 26, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 28) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 4));
|
|
w0 = _mm256_srli_epi32(tmp,28);
|
|
_mm256_storeu_si256(compressed + 27, w1);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 29) ) ;
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32(tmp , 3));
|
|
w1 = _mm256_srli_epi32(tmp,29);
|
|
_mm256_storeu_si256(compressed + 28, w0);
|
|
tmp = _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 30) ) ;
|
|
w1 = _mm256_or_si256(w1,_mm256_slli_epi32(tmp , 2));
|
|
w0 = _mm256_srli_epi32(tmp,30);
|
|
_mm256_storeu_si256(compressed + 29, w1);
|
|
w0 = _mm256_or_si256(w0,_mm256_slli_epi32( _mm256_and_si256 ( mask, _mm256_lddqu_si256 (in + 31) ) , 1));
|
|
_mm256_storeu_si256(compressed + 30, w0);
|
|
}
|
|
|
|
|
|
/* we are going to pack 256 32-bit values, touching 32 256-bit words, using 512 bytes */
static void avxpackblockmask32(const uint32_t * pin, __m256i * compressed) {
  /* we are going to touch 32 256-bit words */
  __m256i w0, w1;
  const __m256i * in = (const __m256i *) pin;
  w0 = _mm256_lddqu_si256 (in + 0) ;
  _mm256_storeu_si256(compressed + 0, w0);
  w1 = _mm256_lddqu_si256 (in + 1) ;
  _mm256_storeu_si256(compressed + 1, w1);
  w0 = _mm256_lddqu_si256 (in + 2) ;
  _mm256_storeu_si256(compressed + 2, w0);
  w1 = _mm256_lddqu_si256 (in + 3) ;
  _mm256_storeu_si256(compressed + 3, w1);
  w0 = _mm256_lddqu_si256 (in + 4) ;
  _mm256_storeu_si256(compressed + 4, w0);
  w1 = _mm256_lddqu_si256 (in + 5) ;
  _mm256_storeu_si256(compressed + 5, w1);
  w0 = _mm256_lddqu_si256 (in + 6) ;
  _mm256_storeu_si256(compressed + 6, w0);
  w1 = _mm256_lddqu_si256 (in + 7) ;
  _mm256_storeu_si256(compressed + 7, w1);
  w0 = _mm256_lddqu_si256 (in + 8) ;
  _mm256_storeu_si256(compressed + 8, w0);
  w1 = _mm256_lddqu_si256 (in + 9) ;
  _mm256_storeu_si256(compressed + 9, w1);
  w0 = _mm256_lddqu_si256 (in + 10) ;
  _mm256_storeu_si256(compressed + 10, w0);
  w1 = _mm256_lddqu_si256 (in + 11) ;
  _mm256_storeu_si256(compressed + 11, w1);
  w0 = _mm256_lddqu_si256 (in + 12) ;
  _mm256_storeu_si256(compressed + 12, w0);
  w1 = _mm256_lddqu_si256 (in + 13) ;
  _mm256_storeu_si256(compressed + 13, w1);
  w0 = _mm256_lddqu_si256 (in + 14) ;
  _mm256_storeu_si256(compressed + 14, w0);
  w1 = _mm256_lddqu_si256 (in + 15) ;
  _mm256_storeu_si256(compressed + 15, w1);
  w0 = _mm256_lddqu_si256 (in + 16) ;
  _mm256_storeu_si256(compressed + 16, w0);
  w1 = _mm256_lddqu_si256 (in + 17) ;
  _mm256_storeu_si256(compressed + 17, w1);
  w0 = _mm256_lddqu_si256 (in + 18) ;
  _mm256_storeu_si256(compressed + 18, w0);
  w1 = _mm256_lddqu_si256 (in + 19) ;
  _mm256_storeu_si256(compressed + 19, w1);
  w0 = _mm256_lddqu_si256 (in + 20) ;
  _mm256_storeu_si256(compressed + 20, w0);
  w1 = _mm256_lddqu_si256 (in + 21) ;
  _mm256_storeu_si256(compressed + 21, w1);
  w0 = _mm256_lddqu_si256 (in + 22) ;
  _mm256_storeu_si256(compressed + 22, w0);
  w1 = _mm256_lddqu_si256 (in + 23) ;
  _mm256_storeu_si256(compressed + 23, w1);
  w0 = _mm256_lddqu_si256 (in + 24) ;
  _mm256_storeu_si256(compressed + 24, w0);
  w1 = _mm256_lddqu_si256 (in + 25) ;
  _mm256_storeu_si256(compressed + 25, w1);
  w0 = _mm256_lddqu_si256 (in + 26) ;
  _mm256_storeu_si256(compressed + 26, w0);
  w1 = _mm256_lddqu_si256 (in + 27) ;
  _mm256_storeu_si256(compressed + 27, w1);
  w0 = _mm256_lddqu_si256 (in + 28) ;
  _mm256_storeu_si256(compressed + 28, w0);
  w1 = _mm256_lddqu_si256 (in + 29) ;
  _mm256_storeu_si256(compressed + 29, w1);
  w0 = _mm256_lddqu_si256 (in + 30) ;
  _mm256_storeu_si256(compressed + 30, w0);
  w1 = _mm256_lddqu_si256 (in + 31) ;
  _mm256_storeu_si256(compressed + 31, w1);
}

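/* Illustrative sketch (not part of the generated code): a caller pairs a packer
   with the unpacker of the same bit width. The unpacker used below is assumed to
   be defined further down in this file; the buffers are hypothetical.

     uint32_t in[256];
     uint32_t out[256];
     __m256i buf[32];                  // 32 256-bit words suffice for any width
     avxpackblockmask32(in, buf);      // pack 256 values at 32 bits per value
     avxunpackblock32(buf, out);       // assumed counterpart for width 32
     // after the round trip, out is expected to equal in

   The "mask" packers AND each input with the width mask first, so they stay
   correct even when some inputs exceed the chosen bit width. */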
static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {
  (void) compressed;
  /* a 0-bit block decodes to 256 zero values: clear all 256 32-bit outputs,
     i.e. 256 * sizeof(uint32_t) bytes rather than 256 bytes */
  memset(pout, 0, 256 * sizeof(uint32_t));
}

/* we packed 256 1-bit values, touching 1 256-bit word, using 16 bytes */
static void avxunpackblock1(const __m256i * compressed, uint32_t * pout) {
  /* we are going to access 1 256-bit word */
  __m256i w0;
  __m256i * out = (__m256i *) pout;
  const __m256i mask = _mm256_set1_epi32(1);
  w0 = _mm256_lddqu_si256 (compressed);
  _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
  _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
  _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
  _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
  _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
  _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
  _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
  _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) );
  _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
  _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
  _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
  _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
  _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
  _mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
  _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
  _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
  _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 19) ) );
  _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
  _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
  _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
  _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 23) ) );
  _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
  _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 25) ) );
  _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
  _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 27) ) );
  _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 28) ) );
  _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 29) ) );
  _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 30) ) );
  _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 31) );
}

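/* Note (explanatory): for 1-bit values, bit j of each 32-bit lane holds value j of
   that lane's stream, so the j-th output vector is simply (w0 >> j) & 1 per lane.
   The final shift by 31 needs no mask because the top bit is already isolated. */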
/* we packed 256 2-bit values, touching 2 256-bit words, using 32 bytes */
static void avxunpackblock2(const __m256i * compressed, uint32_t * pout) {
  /* we are going to access 2 256-bit words */
  __m256i w0, w1;
  __m256i * out = (__m256i *) pout;
  const __m256i mask = _mm256_set1_epi32(3);
  w0 = _mm256_lddqu_si256 (compressed);
  _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
  _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
  _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
  _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
  _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
  _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
  _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
  _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
  _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
  _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
  _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
  _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 28) ) );
  _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 30) );
  w1 = _mm256_lddqu_si256 (compressed + 1);
  _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
  _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
  _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
  _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
  _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
  _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
  _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
  _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
  _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
  _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
  _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 26) ) );
  _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 28) ) );
  _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 30) );
}

/* we packed 256 3-bit values, touching 3 256-bit words, using 48 bytes */
static void avxunpackblock3(const __m256i * compressed, uint32_t * pout) {
  /* we are going to access 3 256-bit words */
  __m256i w0, w1;
  __m256i * out = (__m256i *) pout;
  const __m256i mask = _mm256_set1_epi32(7);
  w0 = _mm256_lddqu_si256 (compressed);
  _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
  _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
  _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) );
  _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
  _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
  _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
  _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
  _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
  _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 27) ) );
  w1 = _mm256_lddqu_si256 (compressed + 1);
  _mm256_storeu_si256(out + 10,
    _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
  _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
  _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
  _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
  _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
  _mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
  _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) );
  _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
  _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 25) ) );
  _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 28) ) );
  w0 = _mm256_lddqu_si256 (compressed + 2);
  _mm256_storeu_si256(out + 21,
    _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
  _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
  _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
  _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
  _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
  _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
  _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
  _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 23) ) );
  _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
  _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 29) );
}

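/* Note (explanatory): 3 does not divide 32, so some 3-bit values straddle two
   compressed words. Value 10 starts at bit 30 of the first word: its low 2 bits come
   from (w0 >> 30) and its high bit from (w1 << 2), which is why the spliced
   expression (w0 >> 30) | (w1 << 2) is masked with 7 above. */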
/* we packed 256 4-bit values, touching 4 256-bit words, using 64 bytes */
static void avxunpackblock4(const __m256i * compressed, uint32_t * pout) {
  /* we are going to access 4 256-bit words */
  __m256i w0, w1;
  __m256i * out = (__m256i *) pout;
  const __m256i mask = _mm256_set1_epi32(15);
  w0 = _mm256_lddqu_si256 (compressed);
  _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
  _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
  _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
  _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
  _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 28) );
  w1 = _mm256_lddqu_si256 (compressed + 1);
  _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
  _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
  _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
  _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
  _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 28) );
  w0 = _mm256_lddqu_si256 (compressed + 2);
  _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
  _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
  _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
  _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
  _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 28) );
  w1 = _mm256_lddqu_si256 (compressed + 3);
  _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
  _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
  _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
  _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
  _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 28) );
}

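/* Note (explanatory): 4 divides 32 evenly, so eight 4-bit values fit in each 32-bit
   lane with no straddling; every compressed word is consumed with plain shifts of
   4, 8, ..., 28 and the last extraction per word needs no mask. */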
/* we packed 256 5-bit values, touching 5 256-bit words, using 80 bytes */
static void avxunpackblock5(const __m256i * compressed, uint32_t * pout) {
  /* we are going to access 5 256-bit words */
  __m256i w0, w1;
  __m256i * out = (__m256i *) pout;
  const __m256i mask = _mm256_set1_epi32(31);
  w0 = _mm256_lddqu_si256 (compressed);
  _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
  _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
  _mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
  _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
  _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 25) ) );
  w1 = _mm256_lddqu_si256 (compressed + 1);
  _mm256_storeu_si256(out + 6,
    _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
  _mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
  _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
  _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
  _mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 23) ) );
  w0 = _mm256_lddqu_si256 (compressed + 2);
  _mm256_storeu_si256(out + 12,
    _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
  _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
  _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
  _mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
  _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
  _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 26) ) );
  w1 = _mm256_lddqu_si256 (compressed + 3);
  _mm256_storeu_si256(out + 19,
    _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
  _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
  _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
  _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
  _mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) );
  _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
  w0 = _mm256_lddqu_si256 (compressed + 4);
  _mm256_storeu_si256(out + 25,
    _mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
  _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
  _mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
  _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
  _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
  _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
  _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 27) );
}

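/* Note (explanatory): with 5-bit values the starting offsets cycle through
   0, 5, 10, ..., 30, then wrap to 3, 8, ..., so a fresh compressed word is needed
   after every sixth or seventh value; each spliced load above, e.g.
   (w0 >> 30) | (w1 << 2) for value 6, corresponds to one of those boundary
   crossings. */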
/* we packed 256 6-bit values, touching 6 256-bit words, using 96 bytes */
|
|
static void avxunpackblock6(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 6 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(63);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 24) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 26) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 22) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 26,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 26) );
|
|
}
|
|
|
|
|
|
/* we packed 256 7-bit values, touching 7 256-bit words, using 112 bytes */
|
|
static void avxunpackblock7(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 7 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(127);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 21) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 17) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 24) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 9,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
|
|
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 13,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 23) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 18,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 19) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 22,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 15) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 27,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 25) );
|
|
}
|
|
|
|
|
|
/* we packed 256 8-bit values, touching 8 256-bit words, using 128 bytes */
static void avxunpackblock8(const __m256i * compressed, uint32_t * pout) {
  /* we are going to access 8 256-bit words */
  __m256i w0, w1;
  __m256i * out = (__m256i *) pout;
  const __m256i mask = _mm256_set1_epi32(255);
  w0 = _mm256_lddqu_si256 (compressed);
  _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 3, _mm256_srli_epi32( w0 , 24) );
  w1 = _mm256_lddqu_si256 (compressed + 1);
  _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 24) );
  w0 = _mm256_lddqu_si256 (compressed + 2);
  _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 11, _mm256_srli_epi32( w0 , 24) );
  w1 = _mm256_lddqu_si256 (compressed + 3);
  _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 24) );
  w0 = _mm256_lddqu_si256 (compressed + 4);
  _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 19, _mm256_srli_epi32( w0 , 24) );
  w1 = _mm256_lddqu_si256 (compressed + 5);
  _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 24) );
  w0 = _mm256_lddqu_si256 (compressed + 6);
  _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
  _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
  _mm256_storeu_si256(out + 27, _mm256_srli_epi32( w0 , 24) );
  w1 = _mm256_lddqu_si256 (compressed + 7);
  _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
  _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
  _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 24) );
}

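/* Note (explanatory): the 8-bit case is byte-aligned, so every compressed word holds
   exactly four values and the last one per word (shift by 24) again needs no mask. */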
/* we packed 256 9-bit values, touching 9 256-bit words, using 144 bytes */
|
|
static void avxunpackblock9(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 9 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(511);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 9) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 3,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 22) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 7,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 21) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 14,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 17,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 11) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 24,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 19) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 28,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 23) );
|
|
}
|
|
|
|
|
|
/* we packed 256 10-bit values, touching 10 256-bit words, using 160 bytes */
|
|
static void avxunpackblock10(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 10 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(1023);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 20) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 3,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 6,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 9,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 12,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 22) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 19,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 22,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 25,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 28,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 22) );
|
|
}
|
|
|
|
|
|
/* we packed 256 11-bit values, touching 11 256-bit words, using 176 bytes */
|
|
static void avxunpackblock11(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 11 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(2047);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 2,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 8,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
|
|
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 11,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 14,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 17,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 20,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 18) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 23,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 19) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 26,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 20) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 29,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 21) );
|
|
}
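
/* note: in this and the kernels below, value 31 ends on the top bit of the last
   word, so the closing store is a plain right shift by (32 - bit width), here
   _mm256_srli_epi32(w0, 21), and needs no mask. */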
/* we packed 256 12-bit values, touching 12 256-bit words, using 384 bytes */
|
|
static void avxunpackblock12(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 12 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(4095);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 2,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 20) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 13,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 20) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 18,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 20) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 26,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 29,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 20) );
|
|
}
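
/* note: 256 values * 12 bits = 3072 bits = 12 256-bit words = 384 bytes. Since
   gcd(12, 32) = 4, the bit offsets repeat every 8 values (8 * 12 = 96 bits =
   exactly 3 words), so values 7, 15, 23 and 31 end on a word boundary and are
   stored with a bare shift by 20, no mask needed. */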
/* we packed 256 13-bit values, touching 13 256-bit words, using 416 bytes */
|
|
static void avxunpackblock13(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 13 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(8191);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 13) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 2,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 7,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 9,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( w0 , 11 ) ) ) );
|
|
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 12,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 14,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 17,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 19,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 17) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 22,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 11) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 24,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 18) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 27,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 29,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 19) );
|
|
}
/* we packed 256 14-bit values, touching 14 256-bit words, using 448 bytes */
|
|
static void avxunpackblock14(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 14 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(16383);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 2,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 6,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 9,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 11,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 13,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 18) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 18,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 20,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 22,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 16) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 25,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 27,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 29,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 18) );
|
|
}
/* we packed 256 15-bit values, touching 15 256-bit words, using 480 bytes */
|
|
static void avxunpackblock15(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 15 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(32767);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
_mm256_storeu_si256(out + 1, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 15) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 2,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 6,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 8,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 12,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 14,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 16) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 17,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 14) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 19,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 23,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 25,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 27,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 14);
|
|
_mm256_storeu_si256(out + 29,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
|
|
_mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 17) );
|
|
}
/* we packed 256 16-bit values, touching 16 256-bit words, using 512 bytes */
static void avxunpackblock16(const __m256i * compressed, uint32_t * pout) {
  /* we are going to access 16 256-bit words */
  __m256i w0, w1;
  __m256i * out = (__m256i *) pout;
  const __m256i mask = _mm256_set1_epi32(65535);
  w0 = _mm256_lddqu_si256 (compressed);
  _mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 1, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 1);
  _mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 3, _mm256_srli_epi32( w1 , 16) );
  w0 = _mm256_lddqu_si256 (compressed + 2);
  _mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 5, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 3);
  _mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 16) );
  w0 = _mm256_lddqu_si256 (compressed + 4);
  _mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 9, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 5);
  _mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 11, _mm256_srli_epi32( w1 , 16) );
  w0 = _mm256_lddqu_si256 (compressed + 6);
  _mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 13, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 7);
  _mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 16) );
  w0 = _mm256_lddqu_si256 (compressed + 8);
  _mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 17, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 9);
  _mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 19, _mm256_srli_epi32( w1 , 16) );
  w0 = _mm256_lddqu_si256 (compressed + 10);
  _mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 21, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 11);
  _mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 16) );
  w0 = _mm256_lddqu_si256 (compressed + 12);
  _mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 25, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 13);
  _mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 27, _mm256_srli_epi32( w1 , 16) );
  w0 = _mm256_lddqu_si256 (compressed + 14);
  _mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w0 ) );
  _mm256_storeu_si256(out + 29, _mm256_srli_epi32( w0 , 16) );
  w1 = _mm256_lddqu_si256 (compressed + 15);
  _mm256_storeu_si256(out + 30, _mm256_and_si256 ( mask, w1 ) );
  _mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 16) );
}
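
/* note: 16 divides 32, so every 32-bit lane of a packed word holds exactly two
   values: one masked store extracts the low half and one plain 16-bit shift the
   high half, and no cross-word or(srli, slli) combine is needed. */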
/* we packed 256 17-bit values, touching 17 256-bit words, using 544 bytes */
|
|
static void avxunpackblock17(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 17 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(131071);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 1,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 3,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 7,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 9,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
|
|
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 11,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 13,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 14) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 15,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 16,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 18,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 20,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 22,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 24,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 14);
|
|
_mm256_storeu_si256(out + 26,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 11) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 15);
|
|
_mm256_storeu_si256(out + 28,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 13) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 16);
|
|
_mm256_storeu_si256(out + 30,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 15) );
|
|
}
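
/* note: gcd(17, 32) = 1, so over the 32 values of a lane the starting bit offset
   visits every residue 0..31 exactly once; the 16 values whose offset exceeds
   32 - 17 = 15 straddle a word boundary and are rebuilt with the two-word
   or(srli, slli) combine, while the others come from a single word. */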
/* we packed 256 18-bit values, touching 18 256-bit words, using 576 bytes */
|
|
static void avxunpackblock18(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 18 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(262143);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 1,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 3,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 12) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 7,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 8,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 12,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 14,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 14) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 17,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 19,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 23,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 14);
|
|
_mm256_storeu_si256(out + 24,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 15);
|
|
_mm256_storeu_si256(out + 26,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 16);
|
|
_mm256_storeu_si256(out + 28,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 17);
|
|
_mm256_storeu_si256(out + 30,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 14) );
|
|
}
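
/* note: gcd(18, 32) = 2, so the starting offsets are always even and the layout
   repeats every 16 values (16 * 18 = 288 bits = exactly 9 words); value 16
   therefore begins at bit 0 of a freshly loaded word and is stored with just
   the mask. */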
/* we packed 256 19-bit values, touching 19 256-bit words, using 608 bytes */
|
|
static void avxunpackblock19(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 19 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(524287);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 1,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 3,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
|
|
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 12) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 6,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 8,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 11) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 11,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 13,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 10) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 15,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 16,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 3) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 18,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 20,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 14);
|
|
_mm256_storeu_si256(out + 23,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( w0 , 11 ) ) ) );
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 15);
|
|
_mm256_storeu_si256(out + 25,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 16);
|
|
_mm256_storeu_si256(out + 26,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
|
|
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 17);
|
|
_mm256_storeu_si256(out + 28,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 18);
|
|
_mm256_storeu_si256(out + 30,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 13) );
|
|
}
/* we packed 256 20-bit values, touching 20 256-bit words, using 640 bytes */
|
|
static void avxunpackblock20(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 20 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(1048575);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 1,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 3,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 6,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 12) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 9,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 11,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 12,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 14,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 12) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 17,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 19,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 20,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 14);
|
|
_mm256_storeu_si256(out + 22,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 12) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 15);
|
|
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 16);
|
|
_mm256_storeu_si256(out + 25,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 17);
|
|
_mm256_storeu_si256(out + 27,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 18);
|
|
_mm256_storeu_si256(out + 28,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 19);
|
|
_mm256_storeu_si256(out + 30,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 12) );
|
|
}
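
/* Illustrative usage sketch, not emitted by avxpacking.py: decoding a run of
   consecutive 20-bit blocks laid out back to back, 20 * 32 = 640 bytes per
   block of 256 values. The function and parameter names are assumptions made
   for this example, not part of the generated API. */
static void avxexampledecodeblocks20(const uint8_t *packed, uint32_t nblocks,
                                     uint32_t *out) {
  uint32_t i;
  for (i = 0; i < nblocks; ++i) {
    /* each block is assumed to have been written by the matching 20-bit
       packing routine earlier in this file */
    avxunpackblock20((const __m256i *) (packed + i * 20 * sizeof(__m256i)),
                     out + i * 256);
  }
}
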
/* we packed 256 21-bit values, touching 21 256-bit words, using 672 bytes */
|
|
static void avxunpackblock21(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 21 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(2097151);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 1,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
|
|
_mm256_storeu_si256(out + 2, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 10) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 3,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 9) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 6,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 7,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
|
|
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 9,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 11, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 7) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 12,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 13,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
|
|
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 15,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 27) ,_mm256_slli_epi32( w0 , 5 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 16,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 5) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 18,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 19,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
|
|
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 14);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 15);
|
|
_mm256_storeu_si256(out + 22,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
|
|
_mm256_storeu_si256(out + 23, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 16);
|
|
_mm256_storeu_si256(out + 24,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 17);
|
|
_mm256_storeu_si256(out + 25,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) );
|
|
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 18);
|
|
_mm256_storeu_si256(out + 27,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 19);
|
|
_mm256_storeu_si256(out + 28,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
|
|
_mm256_storeu_si256(out + 29, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 20);
|
|
_mm256_storeu_si256(out + 30,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 11) );
|
|
}
/* we packed 256 22-bit values, touching 22 256-bit words, using 704 bytes */
|
|
static void avxunpackblock22(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 22 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(4194303);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 1,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 2,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
|
|
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 7,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 8,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 6) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 10,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 8);
|
|
_mm256_storeu_si256(out + 11,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 8) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 9);
|
|
_mm256_storeu_si256(out + 13,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 10);
|
|
_mm256_storeu_si256(out + 14,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 10) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 11);
|
|
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 12);
|
|
_mm256_storeu_si256(out + 17,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 13);
|
|
_mm256_storeu_si256(out + 18,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
|
|
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 14);
|
|
_mm256_storeu_si256(out + 20,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 15);
|
|
_mm256_storeu_si256(out + 21,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
|
|
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 16);
|
|
_mm256_storeu_si256(out + 23,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 17);
|
|
_mm256_storeu_si256(out + 24,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
|
|
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 18);
|
|
_mm256_storeu_si256(out + 26,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 19);
|
|
_mm256_storeu_si256(out + 27,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
|
|
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 20);
|
|
_mm256_storeu_si256(out + 29,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 21);
|
|
_mm256_storeu_si256(out + 30,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
|
|
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 10) );
|
|
}
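
/* note: every kernel works out of just two vector registers, w0 and w1,
   alternating between them so that when a value straddles a word boundary its
   low bits are still held in the previously loaded register while its high
   bits come from the freshly loaded one. */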
/* we packed 256 23-bit values, touching 23 256-bit words, using 736 bytes */
|
|
static void avxunpackblock23(const __m256i * compressed, uint32_t * pout) {
|
|
/* we are going to access 23 256-bit words */
|
|
__m256i w0, w1;
|
|
__m256i * out = (__m256i *) pout;
|
|
const __m256i mask = _mm256_set1_epi32(8388607);
|
|
w0 = _mm256_lddqu_si256 (compressed);
|
|
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 1);
|
|
_mm256_storeu_si256(out + 1,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 2);
|
|
_mm256_storeu_si256(out + 2,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
|
|
_mm256_storeu_si256(out + 3, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 3);
|
|
_mm256_storeu_si256(out + 4,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 4);
|
|
_mm256_storeu_si256(out + 5,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 5);
|
|
_mm256_storeu_si256(out + 6,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
|
|
_mm256_storeu_si256(out + 7, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
|
|
w0 = _mm256_lddqu_si256 (compressed + 6);
|
|
_mm256_storeu_si256(out + 8,
|
|
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
|
|
w1 = _mm256_lddqu_si256 (compressed + 7);
|
|
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 11) ,_mm256_slli_epi32( w0 , 21 ) ) ) );
_mm256_storeu_si256(out + 14, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 17, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 7) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 21) ,_mm256_slli_epi32( w0 , 11 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 8) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) );
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 9) );
}


/* we packed 256 24-bit values, touching 24 256-bit words, using 768 bytes */
static void avxunpackblock24(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 24 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(16777215);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 3, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w1 , 8) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 11, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 8) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 20, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w1 , 8) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_srli_epi32( w0 , 8) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 28, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 8) );
}
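
/*
 * Reader's note (not generated by avxpacking.py): per 32-bit lane, the unpack
 * routines above reduce to the following scalar sketch, shown here for the
 * 24-bit case. The names "packed", "j" and "value" are illustrative only.
 *
 *   uint32_t bitpos = 24 * j;                  // j-th value, 0 <= j < 32
 *   uint32_t value = packed[bitpos / 32] >> (bitpos % 32);
 *   if ((bitpos % 32) + 24 > 32)               // value straddles two words
 *       value |= packed[bitpos / 32 + 1] << (32 - (bitpos % 32));
 *   value &= 16777215;                         // 2^24 - 1, the "mask" above
 *
 * The vectorized code applies the same shift/or/and to eight lanes at once,
 * which is why the loads from "compressed" alternate between w0 and w1.
 */
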
/* we packed 256 25-bit values, touching 25 256-bit words, using 800 bytes */
static void avxunpackblock25(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 25 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(33554431);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
_mm256_storeu_si256(out + 4, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 15) ,_mm256_slli_epi32( w0 , 17 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 9, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
_mm256_storeu_si256(out + 13, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 5) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 9) ,_mm256_slli_epi32( w0 , 23 ) ) ) );
_mm256_storeu_si256(out + 18, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) );
_mm256_storeu_si256(out + 22, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 6) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 17) ,_mm256_slli_epi32( w0 , 15 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
_mm256_storeu_si256(out + 27, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 7) );
}


/* we packed 256 26-bit values, touching 26 256-bit words, using 832 bytes */
static void avxunpackblock26(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 26 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(67108863);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
_mm256_storeu_si256(out + 5, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 2) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 6) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
_mm256_storeu_si256(out + 26, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 4) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 6) );
}


/* we packed 256 27-bit values, touching 27 256-bit words, using 864 bytes */
static void avxunpackblock27(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 27 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(134217727);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 7) ,_mm256_slli_epi32( w1 , 25 ) ) ) );
_mm256_storeu_si256(out + 6, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 29) ,_mm256_slli_epi32( w0 , 3 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 9) ,_mm256_slli_epi32( w0 , 23 ) ) ) );
_mm256_storeu_si256(out + 12, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 4) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
_mm256_storeu_si256(out + 19, _mm256_and_si256 ( mask, _mm256_srli_epi32( w0 , 1) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 23) ,_mm256_slli_epi32( w0 , 9 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 25, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 3) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 5) );
}


/* we packed 256 28-bit values, touching 28 256-bit words, using 896 bytes */
static void avxunpackblock28(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 28 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(268435455);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
_mm256_storeu_si256(out + 7, _mm256_srli_epi32( w0 , 4) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 8, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w1 , 4) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
_mm256_storeu_si256(out + 23, _mm256_srli_epi32( w0 , 4) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 24, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 4) );
}


/* we packed 256 29-bit values, touching 29 256-bit words, using 928 bytes */
static void avxunpackblock29(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 29 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(536870911);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 5) ,_mm256_slli_epi32( w1 , 27 ) ) ) );
_mm256_storeu_si256(out + 10, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 2) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 31) ,_mm256_slli_epi32( w0 , 1 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 25) ,_mm256_slli_epi32( w0 , 7 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 19) ,_mm256_slli_epi32( w0 , 13 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 13) ,_mm256_slli_epi32( w0 , 19 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 7) ,_mm256_slli_epi32( w0 , 25 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 4) ,_mm256_slli_epi32( w1 , 28 ) ) ) );
_mm256_storeu_si256(out + 21, _mm256_and_si256 ( mask, _mm256_srli_epi32( w1 , 1) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 9) ,_mm256_slli_epi32( w1 , 23 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 3) );
}


/* we packed 256 30-bit values, touching 30 256-bit words, using 960 bytes */
static void avxunpackblock30(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 30 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(1073741823);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 30) ,_mm256_slli_epi32( w1 , 2 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 26) ,_mm256_slli_epi32( w1 , 6 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 22) ,_mm256_slli_epi32( w1 , 10 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 18) ,_mm256_slli_epi32( w1 , 14 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 14) ,_mm256_slli_epi32( w1 , 18 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 10) ,_mm256_slli_epi32( w1 , 22 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 6) ,_mm256_slli_epi32( w1 , 26 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 4) ,_mm256_slli_epi32( w0 , 28 ) ) ) );
_mm256_storeu_si256(out + 15, _mm256_srli_epi32( w0 , 2) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 16, _mm256_and_si256 ( mask, w1 ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 28) ,_mm256_slli_epi32( w1 , 4 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 24) ,_mm256_slli_epi32( w1 , 8 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 20) ,_mm256_slli_epi32( w1 , 12 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 16) ,_mm256_slli_epi32( w1 , 16 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 12) ,_mm256_slli_epi32( w1 , 20 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 8) ,_mm256_slli_epi32( w1 , 24 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 29);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 4) ,_mm256_slli_epi32( w1 , 28 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w1 , 2) );
}


/* we packed 256 31-bit values, touching 31 256-bit words, using 992 bytes */
static void avxunpackblock31(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 31 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
const __m256i mask = _mm256_set1_epi32(2147483647);
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, _mm256_and_si256 ( mask, w0 ) );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 31) ,_mm256_slli_epi32( w1 , 1 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 30) ,_mm256_slli_epi32( w0 , 2 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 29) ,_mm256_slli_epi32( w1 , 3 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 28) ,_mm256_slli_epi32( w0 , 4 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 27) ,_mm256_slli_epi32( w1 , 5 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 26) ,_mm256_slli_epi32( w0 , 6 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 25) ,_mm256_slli_epi32( w1 , 7 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 24) ,_mm256_slli_epi32( w0 , 8 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 23) ,_mm256_slli_epi32( w1 , 9 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 10,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 22) ,_mm256_slli_epi32( w0 , 10 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 11,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 21) ,_mm256_slli_epi32( w1 , 11 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 12,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 20) ,_mm256_slli_epi32( w0 , 12 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 13,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 19) ,_mm256_slli_epi32( w1 , 13 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 14,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 18) ,_mm256_slli_epi32( w0 , 14 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 15,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 17) ,_mm256_slli_epi32( w1 , 15 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 16,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 16) ,_mm256_slli_epi32( w0 , 16 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 17,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 15) ,_mm256_slli_epi32( w1 , 17 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 18,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 14) ,_mm256_slli_epi32( w0 , 18 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 19,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 13) ,_mm256_slli_epi32( w1 , 19 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 20,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 12) ,_mm256_slli_epi32( w0 , 20 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 21,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 11) ,_mm256_slli_epi32( w1 , 21 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 22,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 10) ,_mm256_slli_epi32( w0 , 22 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 23,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 9) ,_mm256_slli_epi32( w1 , 23 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 24,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 8) ,_mm256_slli_epi32( w0 , 24 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 25,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 7) ,_mm256_slli_epi32( w1 , 25 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 26,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 6) ,_mm256_slli_epi32( w0 , 26 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 27,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 5) ,_mm256_slli_epi32( w1 , 27 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 28,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 4) ,_mm256_slli_epi32( w0 , 28 ) ) ) );
w1 = _mm256_lddqu_si256 (compressed + 29);
_mm256_storeu_si256(out + 29,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w0 , 3) ,_mm256_slli_epi32( w1 , 29 ) ) ) );
w0 = _mm256_lddqu_si256 (compressed + 30);
_mm256_storeu_si256(out + 30,
_mm256_and_si256 ( mask, _mm256_or_si256 (_mm256_srli_epi32( w1 , 2) ,_mm256_slli_epi32( w0 , 30 ) ) ) );
_mm256_storeu_si256(out + 31, _mm256_srli_epi32( w0 , 1) );
}


/* we packed 256 32-bit values, touching 32 256-bit words, using 1024 bytes */
static void avxunpackblock32(const __m256i * compressed, uint32_t * pout) {
/* we are going to access 32 256-bit words */
__m256i w0, w1;
__m256i * out = (__m256i *) pout;
w0 = _mm256_lddqu_si256 (compressed);
_mm256_storeu_si256(out + 0, w0 );
w1 = _mm256_lddqu_si256 (compressed + 1);
_mm256_storeu_si256(out + 1, w1 );
w0 = _mm256_lddqu_si256 (compressed + 2);
_mm256_storeu_si256(out + 2, w0 );
w1 = _mm256_lddqu_si256 (compressed + 3);
_mm256_storeu_si256(out + 3, w1 );
w0 = _mm256_lddqu_si256 (compressed + 4);
_mm256_storeu_si256(out + 4, w0 );
w1 = _mm256_lddqu_si256 (compressed + 5);
_mm256_storeu_si256(out + 5, w1 );
w0 = _mm256_lddqu_si256 (compressed + 6);
_mm256_storeu_si256(out + 6, w0 );
w1 = _mm256_lddqu_si256 (compressed + 7);
_mm256_storeu_si256(out + 7, w1 );
w0 = _mm256_lddqu_si256 (compressed + 8);
_mm256_storeu_si256(out + 8, w0 );
w1 = _mm256_lddqu_si256 (compressed + 9);
_mm256_storeu_si256(out + 9, w1 );
w0 = _mm256_lddqu_si256 (compressed + 10);
_mm256_storeu_si256(out + 10, w0 );
w1 = _mm256_lddqu_si256 (compressed + 11);
_mm256_storeu_si256(out + 11, w1 );
w0 = _mm256_lddqu_si256 (compressed + 12);
_mm256_storeu_si256(out + 12, w0 );
w1 = _mm256_lddqu_si256 (compressed + 13);
_mm256_storeu_si256(out + 13, w1 );
w0 = _mm256_lddqu_si256 (compressed + 14);
_mm256_storeu_si256(out + 14, w0 );
w1 = _mm256_lddqu_si256 (compressed + 15);
_mm256_storeu_si256(out + 15, w1 );
w0 = _mm256_lddqu_si256 (compressed + 16);
_mm256_storeu_si256(out + 16, w0 );
w1 = _mm256_lddqu_si256 (compressed + 17);
_mm256_storeu_si256(out + 17, w1 );
w0 = _mm256_lddqu_si256 (compressed + 18);
_mm256_storeu_si256(out + 18, w0 );
w1 = _mm256_lddqu_si256 (compressed + 19);
_mm256_storeu_si256(out + 19, w1 );
w0 = _mm256_lddqu_si256 (compressed + 20);
_mm256_storeu_si256(out + 20, w0 );
w1 = _mm256_lddqu_si256 (compressed + 21);
_mm256_storeu_si256(out + 21, w1 );
w0 = _mm256_lddqu_si256 (compressed + 22);
_mm256_storeu_si256(out + 22, w0 );
w1 = _mm256_lddqu_si256 (compressed + 23);
_mm256_storeu_si256(out + 23, w1 );
w0 = _mm256_lddqu_si256 (compressed + 24);
_mm256_storeu_si256(out + 24, w0 );
w1 = _mm256_lddqu_si256 (compressed + 25);
_mm256_storeu_si256(out + 25, w1 );
w0 = _mm256_lddqu_si256 (compressed + 26);
_mm256_storeu_si256(out + 26, w0 );
w1 = _mm256_lddqu_si256 (compressed + 27);
_mm256_storeu_si256(out + 27, w1 );
w0 = _mm256_lddqu_si256 (compressed + 28);
_mm256_storeu_si256(out + 28, w0 );
w1 = _mm256_lddqu_si256 (compressed + 29);
_mm256_storeu_si256(out + 29, w1 );
w0 = _mm256_lddqu_si256 (compressed + 30);
_mm256_storeu_si256(out + 30, w0 );
w1 = _mm256_lddqu_si256 (compressed + 31);
_mm256_storeu_si256(out + 31, w1 );
}


static avxpackblockfnc avxfuncPackArr[] = {
&avxpackblock0,
&avxpackblock1,
&avxpackblock2,
&avxpackblock3,
&avxpackblock4,
&avxpackblock5,
&avxpackblock6,
&avxpackblock7,
&avxpackblock8,
&avxpackblock9,
&avxpackblock10,
&avxpackblock11,
&avxpackblock12,
&avxpackblock13,
&avxpackblock14,
&avxpackblock15,
&avxpackblock16,
&avxpackblock17,
&avxpackblock18,
&avxpackblock19,
&avxpackblock20,
&avxpackblock21,
&avxpackblock22,
&avxpackblock23,
&avxpackblock24,
&avxpackblock25,
&avxpackblock26,
&avxpackblock27,
&avxpackblock28,
&avxpackblock29,
&avxpackblock30,
&avxpackblock31,
&avxpackblock32
};

static avxpackblockfnc avxfuncPackMaskArr[] = {
&avxpackblockmask0,
&avxpackblockmask1,
&avxpackblockmask2,
&avxpackblockmask3,
&avxpackblockmask4,
&avxpackblockmask5,
&avxpackblockmask6,
&avxpackblockmask7,
&avxpackblockmask8,
&avxpackblockmask9,
&avxpackblockmask10,
&avxpackblockmask11,
&avxpackblockmask12,
&avxpackblockmask13,
&avxpackblockmask14,
&avxpackblockmask15,
&avxpackblockmask16,
&avxpackblockmask17,
&avxpackblockmask18,
&avxpackblockmask19,
&avxpackblockmask20,
&avxpackblockmask21,
&avxpackblockmask22,
&avxpackblockmask23,
&avxpackblockmask24,
&avxpackblockmask25,
&avxpackblockmask26,
&avxpackblockmask27,
&avxpackblockmask28,
&avxpackblockmask29,
&avxpackblockmask30,
&avxpackblockmask31,
&avxpackblockmask32
};

static avxunpackblockfnc avxfuncUnpackArr[] = {
&avxunpackblock0,
&avxunpackblock1,
&avxunpackblock2,
&avxunpackblock3,
&avxunpackblock4,
&avxunpackblock5,
&avxunpackblock6,
&avxunpackblock7,
&avxunpackblock8,
&avxunpackblock9,
&avxunpackblock10,
&avxunpackblock11,
&avxunpackblock12,
&avxunpackblock13,
&avxunpackblock14,
&avxunpackblock15,
&avxunpackblock16,
&avxunpackblock17,
&avxunpackblock18,
&avxunpackblock19,
&avxunpackblock20,
&avxunpackblock21,
&avxunpackblock22,
&avxunpackblock23,
&avxunpackblock24,
&avxunpackblock25,
&avxunpackblock26,
&avxunpackblock27,
&avxunpackblock28,
&avxunpackblock29,
&avxunpackblock30,
&avxunpackblock31,
&avxunpackblock32
};

/** code generated by avxpacking.py ends here **/


/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpack(const uint32_t * in,__m256i * out, const uint32_t bit) {
avxfuncPackMaskArr[bit](in,out);
}

/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpackwithoutmask(const uint32_t * in,__m256i * out, const uint32_t bit) {
avxfuncPackArr[bit](in,out);
}

/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */
void avxunpack(const __m256i * in,uint32_t * out, const uint32_t bit) {
avxfuncUnpackArr[bit](in,out);
}
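
/*
 * Usage sketch (reader's note, not part of the generated code): round-trip one
 * block of AVXBlockSize (256) integers through avxpack/avxunpack. The buffer
 * names "datain", "dataout" and "scratch" are illustrative only.
 *
 *   uint32_t bit = avxmaxbits(datain);    // widest value in the block
 *   __m256i scratch[32];                  // large enough for bit == 32
 *   avxpack(datain, scratch, bit);        // writes "bit" 256-bit words
 *   avxunpack(scratch, dataout, bit);     // restores the 256 original values
 */
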
#endif /* __AVX2__ */