Merge commit 'f07ccd6e4fbc5bbfeb94d40e0f14bc527a7d5439' as 'cpp/simdcomp'

This commit is contained in:
Manuel Woelker
2017-01-26 20:28:23 +01:00
32 changed files with 95877 additions and 0 deletions

9
cpp/simdcomp/.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
Makefile.in
lib*
unit*
*.o
src/*.lo
src/*.o
src/.deps
src/.dirstamp
src/.libs

11
cpp/simdcomp/.travis.yml Normal file
View File

@@ -0,0 +1,11 @@
language: c
sudo: false
compiler:
- gcc
- clang
branches:
only:
- master
script: make && ./unit

9
cpp/simdcomp/CHANGELOG Normal file
View File

@@ -0,0 +1,9 @@
Upcoming
- added missing include
- improved portability (MSVC)
- implemented C89 compatibility
Version 0.0.3 (19 May 2014)
- improved documentation
Version 0.0.2 (6 February 2014)
- added go demo
Version 0.0.1 (5 February 2014)

27
cpp/simdcomp/LICENSE Normal file
View File

@@ -0,0 +1,27 @@
Copyright (c) 2014--, The authors
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
* Neither the name of the {organization} nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

137
cpp/simdcomp/README.md Normal file
View File

@@ -0,0 +1,137 @@
The SIMDComp library
====================
[![Build Status](https://travis-ci.org/lemire/simdcomp.png)](https://travis-ci.org/lemire/simdcomp)
A simple C library for compressing lists of integers using binary packing and SIMD instructions.
The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers.
This library can decode at least 4 billions of compressed integers per second on most
desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s.
This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.
On a Skylake Intel processor, it can decode integers at a rate 0.3 cycles per integer,
which can easily translate into more than 8 decoded billions integers per second.
Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others
What is it for?
-------------
This is a low-level library for fast integer compression. By design it does not define a compressed
format. It is up to the (sophisticated) user to create a compressed format.
Requirements
-------------
- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.)
- It is possible to build the core part of the code if your processor support SSE2 (Pentium4 or better)
- C99 compliant compiler (GCC is assumed)
- A Linux-like distribution is assumed by the makefile
For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker
Usage
-------
Compression works over blocks of 128 integers.
For a complete working example, see example.c (you can build it and
run it with "make example; ./example").
1) Lists of integers in random order.
```C
const uint32_t b = maxbits(datain);// computes bit width
simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*32 bytes
simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
```
While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b.
2) Sorted lists of integers.
We used differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset).
```C
uint32_t offset = 0;
uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*32 bytes
simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
```
General example for arrays of arbitrary length:
```C
int compress_decompress_demo() {
size_t k, N = 9999;
__m128i * endofbuf;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * buffer;
uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
uint32_t b;
for (k = 0; k < N; ++k){ /* start with k=0, not k=1! */
datain[k] = k;
}
b = maxbits_length(datain, N);
buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory
endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
/* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */
/* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */
simdunpack_length((const __m128i *)buffer, N, backbuffer, b);
for (k = 0; k < N; ++k){
if(datain[k] != backbuffer[k]) {
printf("bug\n");
return -1;
}
}
return 0;
}
```
3) Frame-of-Reference
We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing
routines, but do not use differential coding so they allow faster search in some cases, at the expense
of compression.
Setup
---------
make
make test
and if you are daring:
make install
Go
--------
If you are a go user, there is a "go" folder where you will find a simple demo.
Other libraries
----------------
* Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
* Fast integer compression in C using StreamVByte https://github.com/lemire/streamvbyte
* FastPFOR is a C++ research library well suited to compress unsorted arrays: https://github.com/lemire/FastPFor
* SIMDCompressionAndIntersection is a C++ research library well suited for sorted arrays (differential coding)
and computing intersections: https://github.com/lemire/SIMDCompressionAndIntersection
* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
References
------------
* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399
* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015. http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916
* T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5

View File

@@ -0,0 +1,235 @@
/**
* This code is released under a BSD License.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "simdcomp.h"
#ifdef _MSC_VER
# include <windows.h>
__int64 freq;
typedef __int64 time_snap_t;
static time_snap_t time_snap(void)
{
__int64 now;
QueryPerformanceCounter((LARGE_INTEGER *)&now);
return (__int64)((now*1000000)/freq);
}
# define TIME_SNAP_FMT "%I64d"
#else
# define time_snap clock
# define TIME_SNAP_FMT "%lu"
typedef clock_t time_snap_t;
#endif
void benchmarkSelect() {
uint32_t buffer[128];
uint32_t backbuffer[128];
uint32_t initial = 33;
uint32_t b;
time_snap_t S1, S2, S3;
int i;
printf("benchmarking select \n");
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 0; b <= 32; b++) {
uint32_t prev = initial;
uint32_t out[128];
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)(1655765 * i )) ;
if(b < 32) buffer[i] %= (1<<b);
}
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(initial, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'i' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
S1 = time_snap();
for (i = 0; i < 128 * 10; i++) {
uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i % 128);
assert(valretrieved == buffer[i%128]);
}
S2 = time_snap();
for (i = 0; i < 128 * 10; i++) {
simdunpackd1(initial, (__m128i *)out, backbuffer, b);
assert(backbuffer[i % 128] == buffer[i % 128]);
}
S3 = time_snap();
printf("bit width = %d, fast select function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2));
}
}
int uint32_cmp(const void *a, const void *b)
{
const uint32_t *ia = (const uint32_t *)a;
const uint32_t *ib = (const uint32_t *)b;
if(*ia < *ib)
return -1;
else if (*ia > *ib)
return 1;
return 0;
}
/* adapted from wikipedia */
int binary_search(uint32_t * A, uint32_t key, int imin, int imax)
{
int imid;
imax --;
while(imin + 1 < imax) {
imid = imin + ((imax - imin) / 2);
if (A[imid] > key) {
imax = imid;
} else if (A[imid] < key) {
imin = imid;
} else {
return imid;
}
}
return imax;
}
/* adapted from wikipedia */
int lower_bound(uint32_t * A, uint32_t key, int imin, int imax)
{
int imid;
imax --;
while(imin + 1 < imax) {
imid = imin + ((imax - imin) / 2);
if (A[imid] >= key) {
imax = imid;
} else if (A[imid] < key) {
imin = imid;
}
}
if(A[imin] >= key) return imin;
return imax;
}
void benchmarkSearch() {
uint32_t buffer[128];
uint32_t backbuffer[128];
uint32_t out[128];
uint32_t result, initial = 0;
uint32_t b, i;
time_snap_t S1, S2, S3, S4;
printf("benchmarking search \n");
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 0; b <= 32; b++) {
uint32_t prev = initial;
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)rand()) ;
if(b < 32) buffer[i] %= (1<<b);
}
qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(initial, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'i' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
simdunpackd1(initial, (__m128i *)out, backbuffer, b);
for (i = 0; i < 128; i++) {
assert(buffer[i] == backbuffer[i]);
}
S1 = time_snap();
for (i = 0; i < 128 * 10; i++) {
int pos;
uint32_t pseudorandomkey = buffer[i%128];
__m128i vecinitial = _mm_set1_epi32(initial);
pos = simdsearchd1(&vecinitial, (__m128i *)out, b,
pseudorandomkey, &result);
if((result < pseudorandomkey) || (buffer[pos] != result)) {
printf("bug A.\n");
} else if (pos > 0) {
if(buffer[pos-1] >= pseudorandomkey)
printf("bug B.\n");
}
}
S2 = time_snap();
for (i = 0; i < 128 * 10; i++) {
int pos;
uint32_t pseudorandomkey = buffer[i%128];
simdunpackd1(initial, (__m128i *)out, backbuffer, b);
pos = lower_bound(backbuffer, pseudorandomkey, 0, 128);
result = backbuffer[pos];
if((result < pseudorandomkey) || (buffer[pos] != result)) {
printf("bug C.\n");
} else if (pos > 0) {
if(buffer[pos-1] >= pseudorandomkey)
printf("bug D.\n");
}
}
S3 = time_snap();
for (i = 0; i < 128 * 10; i++) {
int pos;
uint32_t pseudorandomkey = buffer[i%128];
pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128,
pseudorandomkey, &result);
if((result < pseudorandomkey) || (buffer[pos] != result)) {
printf("bug A.\n");
} else if (pos > 0) {
if(buffer[pos-1] >= pseudorandomkey)
printf("bug B.\n");
}
}
S4 = time_snap();
printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2), (S4-S3) );
}
}
int main() {
#ifdef _MSC_VER
QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
#endif
benchmarkSearch();
benchmarkSelect();
return 0;
}

View File

@@ -0,0 +1,205 @@
#include <stdio.h>
#include "simdcomp.h"
#define RDTSC_START(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#define RDTSC_FINAL(cycles) \
do { \
register unsigned cyc_high, cyc_low; \
__asm volatile( \
"rdtscp\n\t" \
"mov %%edx, %0\n\t" \
"mov %%eax, %1\n\t" \
"cpuid\n\t" \
: "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) {
uint32_t * answer = malloc(sizeof(uint32_t) * length);
uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
uint32_t i;
for(i = 0; i < length; ++i) {
answer[i] = rand() & mask;
}
return answer;
}
uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) {
uint32_t * answer = malloc(sizeof(uint32_t) * length);
uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
uint32_t i;
answer[0] = rand() & mask;
for(i = 1; i < length; ++i) {
answer[i] = answer[i-1] + (rand() & mask);
}
return answer;
}
void demo128() {
const uint32_t length = 128;
uint32_t bit;
printf("# --- %s\n", __func__);
printf("# compressing %d integers\n",length);
printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
for(bit = 1; bit <= 32; ++bit) {
uint32_t i;
uint32_t * data = get_random_array_from_bit_width(length, bit);
__m128i * buffer = malloc(length * sizeof(uint32_t));
uint32_t * backdata = malloc(length * sizeof(uint32_t));
uint32_t repeat = 500;
uint64_t min_diff;
printf("%d\t",bit);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdpackwithoutmask(data,buffer, bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdunpack(buffer, backdata,bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
free(data);
free(buffer);
free(backdata);
printf("\n");
}
printf("\n\n"); /* two blank lines are required by gnuplot */
}
void demo128_d1() {
const uint32_t length = 128;
uint32_t bit;
printf("# --- %s\n", __func__);
printf("# compressing %d integers\n",length);
printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
for(bit = 1; bit <= 32; ++bit) {
uint32_t i;
uint32_t * data = get_random_array_from_bit_width_d1(length, bit);
__m128i * buffer = malloc(length * sizeof(uint32_t));
uint32_t * backdata = malloc(length * sizeof(uint32_t));
uint32_t repeat = 500;
uint64_t min_diff;
printf("%d\t",bit);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdpackwithoutmaskd1(0,data,buffer, bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
simdunpackd1(0,buffer, backdata,bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
free(data);
free(buffer);
free(backdata);
printf("\n");
}
printf("\n\n"); /* two blank lines are required by gnuplot */
}
#ifdef __AVX2__
void demo256() {
const uint32_t length = 256;
uint32_t bit;
printf("# --- %s\n", __func__);
printf("# compressing %d integers\n",length);
printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
for(bit = 1; bit <= 32; ++bit) {
uint32_t i;
uint32_t * data = get_random_array_from_bit_width(length, bit);
__m256i * buffer = malloc(length * sizeof(uint32_t));
uint32_t * backdata = malloc(length * sizeof(uint32_t));
uint32_t repeat = 500;
uint64_t min_diff;
printf("%d\t",bit);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
avxpackwithoutmask(data,buffer, bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
min_diff = (uint64_t)-1;
for (i = 0; i < repeat; i++) {
uint64_t cycles_start, cycles_final, cycles_diff;
__asm volatile("" ::: /* pretend to clobber */ "memory");
RDTSC_START(cycles_start);
avxunpack(buffer, backdata,bit);
RDTSC_FINAL(cycles_final);
cycles_diff = (cycles_final - cycles_start);
if (cycles_diff < min_diff) min_diff = cycles_diff;
}
printf("%.2f\t",min_diff*1.0/length);
free(data);
free(buffer);
free(backdata);
printf("\n");
}
printf("\n\n"); /* two blank lines are required by gnuplot */
}
#endif /* avx 2 */
int main() {
demo128();
demo128_d1();
#ifdef __AVX2__
demo256();
#endif
return 0;
}

195
cpp/simdcomp/example.c Normal file
View File

@@ -0,0 +1,195 @@
/* Type "make example" to build this example program. */
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "simdcomp.h"
/**
We provide several different code examples.
**/
/* very simple test to illustrate a simple application */
int compress_decompress_demo() {
size_t k, N = 9999;
__m128i * endofbuf;
int howmanybytes;
float compratio;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * buffer;
uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
uint32_t b;
printf("== simple test\n");
for (k = 0; k < N; ++k) { /* start with k=0, not k=1! */
datain[k] = k;
}
b = maxbits_length(datain, N);
buffer = malloc(simdpack_compressedbytes(N,b));
endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */
compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes;
/* endofbuf points to the end of the compressed data */
buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); /* optional but safe. */
printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio);
/* in actual applications b must be stored and retrieved: caller is responsible for that. */
simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */
for (k = 0; k < N; ++k) {
if(datain[k] != backbuffer[k]) {
printf("bug at %lu \n",(unsigned long)k);
return -1;
}
}
printf("Code works!\n");
free(datain);
free(buffer);
free(backbuffer);
return 0;
}
/* compresses data from datain to buffer, returns how many bytes written
used below in simple_demo */
size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
uint32_t offset;
uint8_t * initout;
size_t k;
if(length/SIMDBlockSize*SIMDBlockSize != length) {
printf("Data length should be a multiple of %i \n",SIMDBlockSize);
}
offset = 0;
initout = buffer;
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = simdmaxbitsd1(offset,
datain + k * SIMDBlockSize);
*buffer++ = b;
simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer,
b);
offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
buffer += b * sizeof(__m128i);
}
return buffer - initout;
}
/* Another illustration ... */
void simple_demo() {
size_t REPEAT = 10, gap;
size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
uint32_t * datain = malloc(N * sizeof(uint32_t));
size_t compsize;
clock_t start, end;
uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
printf("== simple demo\n");
for (gap = 1; gap <= 243; gap *= 3) {
size_t k, repeat;
uint32_t offset = 0;
uint32_t bogus = 0;
double numberofseconds;
printf("\n");
printf(" gap = %lu \n", (unsigned long) gap);
datain[0] = 0;
for (k = 1; k < N; ++k)
datain[k] = datain[k-1] + ( rand() % (gap + 1) );
compsize = compress(datain,N,buffer);
printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 ));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
uint8_t b = *decbuffer++;
simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
/* do something here with backbuffer */
bogus += backbuffer[3];
decbuffer += b * sizeof(__m128i);
offset = backbuffer[SIMDBlockSize - 1];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
start = clock();
for(repeat = 0; repeat < REPEAT; ++repeat) {
uint8_t * decbuffer = buffer;
for (k = 0; k * SIMDBlockSize < N; ++k) {
memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t));
bogus += backbuffer[3] - backbuffer[100];
}
}
end = clock();
numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
printf("ignore me %i \n",bogus);
printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
}
free(buffer);
free(datain);
free(backbuffer);
}
/* Used below in more_sophisticated_demo ... */
size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) {
uint8_t * initout;
size_t k;
if(length/SIMDBlockSize*SIMDBlockSize != length) {
printf("Data length should be a multiple of %i \n",SIMDBlockSize);
}
initout = buffer;
for(k = 0; k < length / SIMDBlockSize; ++k) {
uint32_t b = maxbits(datain);
*buffer++ = b;
simdpackwithoutmask(datain, (__m128i *)buffer, b);
datain += SIMDBlockSize;
buffer += b * sizeof(__m128i);
}
return buffer - initout;
}
/* Here we compress the data in blocks of 128 integers with varying bit width */
int varying_bit_width_demo() {
size_t nn = 128 * 2;
uint32_t * datainn = malloc(nn * sizeof(uint32_t));
uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
uint8_t * initbuffern = buffern;
uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
size_t k, compsize;
printf("== varying bit-width demo\n");
for(k=0; k<nn; ++k) {
datainn[k] = rand() % (k + 1);
}
compsize = varying_bit_width_compress(datainn,nn,buffern);
printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
(unsigned)(nn * sizeof(uint32_t)));
for (k = 0; k * SIMDBlockSize < nn; ++k) {
uint32_t b = *buffern;
buffern++;
simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
buffern += b * sizeof(__m128i);
}
for (k = 0; k < nn; ++k) {
if(backbuffern[k] != datainn[k]) {
printf("bug\n");
return -1;
}
}
printf("Code works!\n");
free(datainn);
free(initbuffern);
free(backbuffern);
return 0;
}
int main() {
if(compress_decompress_demo() != 0) return -1;
if(varying_bit_width_demo() != 0) return -1;
simple_demo();
return 0;
}

13
cpp/simdcomp/go/README.md Normal file
View File

@@ -0,0 +1,13 @@
Simple Go demo
==============
Setup
======
Start by installing the simdcomp library (make && make install).
Then type:
go run test.go

71
cpp/simdcomp/go/test.go Normal file
View File

@@ -0,0 +1,71 @@
/////////
// This particular file is in the public domain.
// Author: Daniel Lemire
////////
package main
/*
#cgo LDFLAGS: -lsimdcomp
#include <simdcomp.h>
*/
import "C"
import "fmt"
//////////
// For this demo, we pack and unpack blocks of 128 integers
/////////
func main() {
// I am going to use C types. Alternative might be to use unsafe.Pointer calls, see http://bit.ly/1ndw3W3
// this is our original data
var data [128]C.uint32_t
for i := C.uint32_t(0); i < C.uint32_t(128); i++ {
data[i] = i
}
////////////
// We first pack without differential coding
///////////
// computing how many bits per int. is needed
b := C.maxbits(&data[0])
ratio := 32.0/float64(b)
fmt.Println("Bit width ", b)
fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio))
// we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
out := make([] C.__m128i,b)
C.simdpackwithoutmask( &data[0],&out[0],b);
var recovereddata [128]C.uint32_t
C.simdunpack(&out[0],&recovereddata[0],b)
for i := 0; i < 128; i++ {
if data[i] != recovereddata[i] {
fmt.Println("Bug ")
return
}
}
///////////
// Next, we use differential coding
//////////
offset := C.uint32_t(0) // if you pack data from K to K + 128, offset should be the value at K-1. When K = 0, choose a default
b1 := C.simdmaxbitsd1(offset,&data[0])
ratio1 := 32.0/float64(b1)
fmt.Println("Bit width ", b1)
fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio1))
// we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
out = make([] C.__m128i,b1)
C.simdpackwithoutmaskd1(offset, &data[0],&out[0],b1);
C.simdunpackd1(offset,&out[0],&recovereddata[0],b1)
for i := 0; i < 128; i++ {
if data[i] != recovereddata[i] {
fmt.Println("Bug ")
return
}
}
fmt.Println("test succesful.")
}

View File

@@ -0,0 +1,40 @@
/**
* This code is released under a BSD License.
*/
#ifndef INCLUDE_AVXBITPACKING_H_
#define INCLUDE_AVXBITPACKING_H_
#ifdef __AVX2__
#include "portability.h"
/* AVX2 is required */
#include <immintrin.h>
/* for memset */
#include <string.h>
#include "simdcomputil.h"
enum{ AVXBlockSize = 256};
/* max integer logarithm over a range of AVXBlockSize integers (256 integer) */
uint32_t avxmaxbits(const uint32_t * begin);
/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpack(const uint32_t * in,__m256i * out, const uint32_t bit);
/* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */
void avxpackwithoutmask(const uint32_t * in,__m256i * out, const uint32_t bit);
/* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */
void avxunpack(const __m256i * in,uint32_t * out, const uint32_t bit);
#endif /* __AVX2__ */
#endif /* INCLUDE_AVXBITPACKING_H_ */

View File

@@ -0,0 +1,81 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDBITCOMPAT_H_
#define SIMDBITCOMPAT_H_
#include <iso646.h> /* mostly for Microsoft compilers */
#include <string.h>
#if SIMDCOMP_DEBUG
# define SIMDCOMP_ALWAYS_INLINE inline
# define SIMDCOMP_NEVER_INLINE
# define SIMDCOMP_PURE
#else
# if defined(__GNUC__)
# if __GNUC__ >= 3
# define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
# define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
# define SIMDCOMP_PURE __attribute__((pure))
# else
# define SIMDCOMP_ALWAYS_INLINE inline
# define SIMDCOMP_NEVER_INLINE
# define SIMDCOMP_PURE
# endif
# elif defined(_MSC_VER)
# define SIMDCOMP_ALWAYS_INLINE __forceinline
# define SIMDCOMP_NEVER_INLINE
# define SIMDCOMP_PURE
# else
# if __has_attribute(always_inline)
# define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
# else
# define SIMDCOMP_ALWAYS_INLINE inline
# endif
# if __has_attribute(noinline)
# define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
# else
# define SIMDCOMP_NEVER_INLINE
# endif
# if __has_attribute(pure)
# define SIMDCOMP_PURE __attribute__((pure))
# else
# define SIMDCOMP_PURE
# endif
# endif
#endif
#if defined(_MSC_VER) && _MSC_VER < 1600
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef signed char int8_t;
#else
#include <stdint.h> /* part of Visual Studio 2010 and better, others likely anyway */
#endif
#if defined(_MSC_VER)
#define SIMDCOMP_ALIGNED(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define SIMDCOMP_ALIGNED(x) __attribute__ ((aligned(x)))
#endif
#endif
#if defined(_MSC_VER)
# include <intrin.h>
/* 64-bit needs extending */
# define SIMDCOMP_CTZ(result, mask) do { \
unsigned long index; \
if (!_BitScanForward(&(index), (mask))) { \
(result) = 32U; \
} else { \
(result) = (uint32_t)(index); \
} \
} while (0)
#else
# define SIMDCOMP_CTZ(result, mask) \
result = __builtin_ctz(mask)
#endif
#endif /* SIMDBITCOMPAT_H_ */

View File

@@ -0,0 +1,72 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDBITPACKING_H_
#define SIMDBITPACKING_H_
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
/* for memset */
#include <string.h>
#include "simdcomputil.h"
/***
* Please see example.c for various examples on how to make good use
* of these functions.
*/
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out".
* The input values are masked so that only the least significant "bit" bits are used. */
void simdpack(const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out".
* The input values are assumed to be less than 1<<bit. */
void simdpackwithoutmask(const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */
void simdunpack(const __m128i * in,uint32_t * out, const uint32_t bit);
/* how many compressed bytes are needed to compressed length integers using a bit width of bit with
the simdpackFOR_length function. */
int simdpack_compressedbytes(int length, const uint32_t bit);
/* like simdpack, but supports an undetermined number of inputs.
* This is useful if you need to unpack an array of integers that is not divisible by 128 integers.
* Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location between
the provided (out) pointer and the returned pointer. */
__m128i * simdpack_length(const uint32_t * in, size_t length, __m128i * out, const uint32_t bit);
/* like simdunpack, but supports an undetermined number of inputs.
* This is useful if you need to unpack an array of integers that is not divisible by 128 integers.
* Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided
(in) pointer and the returned pointer. */
const __m128i * simdunpack_length(const __m128i * in, size_t length, uint32_t * out, const uint32_t bit);
/* like simdpack, but supports an undetermined small number of inputs. This is useful if you need to pack less
than 128 integers.
* Note that this function is much slower.
* Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location
between the provided (out) pointer and the returned pointer. */
__m128i * simdpack_shortlength(const uint32_t * in, int length, __m128i * out, const uint32_t bit);
/* like simdunpack, but supports an undetermined small number of inputs. This is useful if you need to unpack less
than 128 integers.
* Note that this function is much slower.
* Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided (in)
pointer and the returned pointer. */
const __m128i * simdunpack_shortlength(const __m128i * in, int length, uint32_t * out, const uint32_t bit);
/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
void simdfastset(__m128i * in128, uint32_t b, uint32_t value, size_t index);
#endif /* SIMDBITPACKING_H_ */

View File

@@ -0,0 +1,22 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDCOMP_H_
#define SIMDCOMP_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "simdbitpacking.h"
#include "simdcomputil.h"
#include "simdfor.h"
#include "simdintegratedbitpacking.h"
#include "avxbitpacking.h"
#ifdef __cplusplus
} // extern "C"
#endif
#endif

View File

@@ -0,0 +1,54 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMDCOMPUTIL_H_
#define SIMDCOMPUTIL_H_
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
/* returns the integer logarithm of v (bit width) */
uint32_t bits(const uint32_t v);
/* max integer logarithm over a range of SIMDBlockSize integers (128 integer) */
uint32_t maxbits(const uint32_t * begin);
/* same as maxbits, but we specify the number of integers */
uint32_t maxbits_length(const uint32_t * in,uint32_t length);
enum{ SIMDBlockSize = 128};
/* computes (quickly) the minimal value of 128 values */
uint32_t simdmin(const uint32_t * in);
/* computes (quickly) the minimal value of the specified number of values */
uint32_t simdmin_length(const uint32_t * in, uint32_t length);
#ifdef __SSE4_1__
/* computes (quickly) the minimal and maximal value of the specified number of values */
void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax);
/* computes (quickly) the minimal and maximal value of the 128 values */
void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax);
#endif
/* like maxbit over 128 integers (SIMDBlockSize) with provided initial value
and using differential coding */
uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in);
/* like simdmaxbitsd1, but calculates maxbits over |length| integers
with provided initial value. |length| can be any arbitrary value. */
uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
uint32_t length);
#endif /* SIMDCOMPUTIL_H_ */

View File

@@ -0,0 +1,72 @@
/**
* This code is released under a BSD License.
*/
#ifndef INCLUDE_SIMDFOR_H_
#define INCLUDE_SIMDFOR_H_
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
#include "simdcomputil.h"
#include "simdbitpacking.h"
#ifdef __cplusplus
extern "C" {
#endif
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out" */
void simdpackFOR(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */
void simdunpackFOR(uint32_t initvalue, const __m128i * in,uint32_t * out, const uint32_t bit);
/* how many compressed bytes are needed to compressed length integers using a bit width of bit with
the simdpackFOR_length function. */
int simdpackFOR_compressedbytes(int length, const uint32_t bit);
/* like simdpackFOR, but supports an undetermined number of inputs.
This is useful if you need to pack less than 128 integers. Note that this function is much slower.
Compressed data is stored in the memory location between
the provided (out) pointer and the returned pointer. */
__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t * in, int length, __m128i * out, const uint32_t bit);
/* like simdunpackFOR, but supports an undetermined number of inputs.
This is useful if you need to unpack less than 128 integers. Note that this function is much slower.
The read compressed data is between the provided
(in) pointer and the returned pointer. */
const __m128i * simdunpackFOR_length(uint32_t initvalue, const __m128i * in, int length, uint32_t * out, const uint32_t bit);
/* returns the value stored at the specified "slot".
* */
uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
int slot);
/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
void simdfastsetFOR(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
* which is >= |key|, and returns its position. It is assumed that the values
* stored are in sorted order.
* The encoded key is stored in "*presult".
* The first length decoded integers, ignoring others. If no value is larger or equal to the key,
* length is returned. Length should be no larger than 128.
*
* If no value is larger or equal to the key,
* length is returned */
int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
int length, uint32_t key, uint32_t *presult);
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* INCLUDE_SIMDFOR_H_ */

View File

@@ -0,0 +1,98 @@
/**
* This code is released under a BSD License.
*/
#ifndef SIMD_INTEGRATED_BITPACKING_H
#define SIMD_INTEGRATED_BITPACKING_H
#include "portability.h"
/* SSE2 is required */
#include <emmintrin.h>
#include "simdcomputil.h"
#include "simdbitpacking.h"
#ifdef __cplusplus
extern "C" {
#endif
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out"
integer values should be in sorted order (for best results).
The differences are masked so that only the least significant "bit" bits are used. */
void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads 128 values from "in", writes "bit" 128-bit vectors to "out"
integer values should be in sorted order (for best results).
The difference values are assumed to be less than 1<<bit. */
void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in,__m128i * out, const uint32_t bit);
/* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */
void simdunpackd1(uint32_t initvalue, const __m128i * in,uint32_t * out, const uint32_t bit);
/* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the first encoded uint32 value
* which is >= |key|, and returns its position. It is assumed that the values
* stored are in sorted order.
* The encoded key is stored in "*presult". If no value is larger or equal to the key,
* 128 is returned. The pointer initOffset is a pointer to the last four value decoded
* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init)),
* and the vector gets updated.
**/
int
simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit,
uint32_t key, uint32_t *presult);
/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
* which is >= |key|, and returns its position. It is assumed that the values
* stored are in sorted order.
* The encoded key is stored in "*presult".
* The first length decoded integers, ignoring others. If no value is larger or equal to the key,
* length is returned. Length should be no larger than 128.
*
* If no value is larger or equal to the key,
* length is returned */
int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
int length, uint32_t key, uint32_t *presult);
/* returns the value stored at the specified "slot".
* */
uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
int slot);
/* given a block of 128 packed values, this function sets the value at index "index" to "value",
* you must somehow know the previous value.
* Because of differential coding, all following values are incremented by the offset between this new
* value and the old value...
* This functions is useful if you want to modify the last value.
*/
void simdfastsetd1fromprevious( __m128i * in, uint32_t bit, uint32_t previousvalue, uint32_t value, size_t index);
/* given a block of 128 packed values, this function sets the value at index "index" to "value",
* This function computes the previous value if needed.
* Because of differential coding, all following values are incremented by the offset between this new
* value and the old value...
* This functions is useful if you want to modify the last value.
*/
void simdfastsetd1(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
/*Simply scan the data
* The pointer initOffset is a pointer to the last four value decoded
* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init);),
* and the vector gets updated.
* */
void
simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit);
#ifdef __cplusplus
} // extern "C"
#endif
#endif

79
cpp/simdcomp/makefile Normal file
View File

@@ -0,0 +1,79 @@
# minimalist makefile
.SUFFIXES:
#
.SUFFIXES: .cpp .o .c .h
ifeq ($(DEBUG),1)
CFLAGS = -fPIC -std=c89 -ggdb -msse4.1 -march=native -Wall -Wextra -Wshadow -fsanitize=undefined -fno-omit-frame-pointer -fsanitize=address
else
CFLAGS = -fPIC -std=c89 -O3 -msse4.1 -march=native -Wall -Wextra -Wshadow
endif # debug
LDFLAGS = -shared
LIBNAME=libsimdcomp.so.0.0.3
all: unit unit_chars bitpackingbenchmark $(LIBNAME)
test:
./unit
./unit_chars
install: $(OBJECTS)
cp $(LIBNAME) /usr/local/lib
ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
ldconfig
cp $(HEADERS) /usr/local/include
HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h
uninstall:
for h in $(HEADERS) ; do rm /usr/local/$$h; done
rm /usr/local/lib/$(LIBNAME)
rm /usr/local/lib/libsimdcomp.so
ldconfig
OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \
simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o
$(LIBNAME): $(OBJECTS)
$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS)
avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude
simdfor.o: ./src/simdfor.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude
simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude
simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS)
$(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude
example: ./example.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS)
unit: ./tests/unit.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS)
bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude $(OBJECTS)
benchmark: ./benchmarks/benchmark.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude $(OBJECTS)
dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME)
$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lsimdcomp
unit_chars: ./tests/unit_chars.c $(HEADERS) $(OBJECTS)
$(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude $(OBJECTS)
clean:
rm -f unit *.o $(LIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars

104
cpp/simdcomp/makefile.vc Normal file
View File

@@ -0,0 +1,104 @@
!IFNDEF MACHINE
!IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64"
MACHINE=x64
!ELSE
MACHINE=x86
!ENDIF
!ENDIF
!IFNDEF DEBUG
DEBUG=no
!ENDIF
!IFNDEF CC
CC=cl.exe
!ENDIF
!IFNDEF AR
AR=lib.exe
!ENDIF
!IFNDEF LINK
LINK=link.exe
!ENDIF
!IFNDEF PGO
PGO=no
!ENDIF
!IFNDEF PGI
PGI=no
!ENDIF
INC = /Iinclude
!IF "$(DEBUG)"=="yes"
CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm
ARFLAGS = /nologo
LDFLAGS = /nologo /debug /nodefaultlib:msvcrt
!ELSE
CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP
ARFLAGS = /nologo /LTCG
LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf
!ENDIF
!IF "$(PGI)"=="yes"
LDFLAGS = $(LDFLAGS) /ltcg:pgi
!ENDIF
!IF "$(PGO)"=="yes"
LDFLAGS = $(LDFLAGS) /ltcg:pgo
!ENDIF
LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \
simdpackedsearch.obj simdpackedselect.obj simdfor.obj
all: lib dll dynunit unit_chars example benchmark
# need some good use case scenario to train the instrumented build
@if "$(PGI)"=="yes" echo Running PGO training
@if "$(PGI)"=="yes" benchmark.exe >nul 2>&1
@if "$(PGI)"=="yes" example.exe >nul 2>&1
$(LIB_OBJS):
$(CC) $(INC) $(CFLAGS) /c src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \
src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c
lib: $(LIB_OBJS)
$(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS)
dll: $(LIB_OBJS)
$(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS)
unit: lib
$(CC) $(INC) $(CFLAGS) /c src/unit.c
$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib
dynunit: dll
$(CC) $(INC) $(CFLAGS) /c src/unit.c
$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib
unit_chars: lib
$(CC) $(INC) $(CFLAGS) /c src/unit_chars.c
$(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp.lib
example: lib
$(CC) $(INC) $(CFLAGS) /c example.c
$(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp.lib
benchmark: lib
$(CC) $(INC) $(CFLAGS) /c src/benchmark.c
$(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp.lib
clean:
del /Q *.obj
del /Q *.lib
del /Q *.exe
del /Q *.dll
del /Q *.pgc
del /Q *.pgd
del /Q *.pdb

16
cpp/simdcomp/package.json Normal file
View File

@@ -0,0 +1,16 @@
{
"name": "simdcomp",
"version": "0.0.3",
"repo": "lemire/simdcomp",
"description": "A simple C library for compressing lists of integers",
"license": "BSD-3-Clause",
"src": [
"src/simdbitpacking.c",
"src/simdcomputil.c",
"src/simdintegratedbitpacking.c",
"include/simdbitpacking.h",
"include/simdcomp.h",
"include/simdcomputil.h",
"include/simdintegratedbitpacking.h"
]
}

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python
import sys
def howmany(bit):
""" how many values are we going to pack? """
return 256
def howmanywords(bit):
return (howmany(bit) * bit + 255)/256
def howmanybytes(bit):
return howmanywords(bit) * 16
print("""
/** code generated by avxpacking.py starts here **/
""")
print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""")
print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""")
def plurial(number):
if(number <> 1):
return "s"
else :
return ""
print("")
print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {");
print(" (void)compressed;");
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
print(" const __m256i * in = (const __m256i *) pin;");
print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
if( (bit & (bit-1)) <> 0) : print(" __m256i tmp; /* used to store inputs at word boundary */")
oldword = 0
for j in range(howmany(bit)/8):
firstword = j * bit / 32
if(firstword > oldword):
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
oldword = firstword
secondword = (j * bit + bit - 1)/32
firstshift = (j*bit) % 32
if( firstword == secondword):
if(firstshift == 0):
print(" w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j))
else:
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
else:
print(" tmp = _mm256_lddqu_si256 (in + {0});".format(j))
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
secondshift = 32-firstshift
print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
print("}");
print("")
print("")
print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {");
print(" (void)compressed;");
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
print(" const __m256i * in = (const __m256i *) pin;");
if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
def maskfnc(x):
if(bit == 32): return x
return " _mm256_and_si256 ( mask, {0}) ".format(x)
if( (bit & (bit-1)) <> 0) : print(" __m256i tmp; /* used to store inputs at word boundary */")
oldword = 0
for j in range(howmany(bit)/8):
firstword = j * bit / 32
if(firstword > oldword):
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
oldword = firstword
secondword = (j * bit + bit - 1)/32
firstshift = (j*bit) % 32
loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
if( firstword == secondword):
if(firstshift == 0):
print(" w{0} = {1};".format(firstword%2,loadstr))
else:
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
else:
print(" tmp = {0};".format(loadstr))
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
secondshift = 32-firstshift
print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
print("}");
print("")
print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {");
print(" (void) compressed;");
print(" memset(pout,0,{0});".format(howmany(0)));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit));
print(" /* we are going to access {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
print(" __m256i * out = (__m256i *) pout;");
if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
maskstr = " _mm256_and_si256 ( mask, {0}) "
if (bit == 32) : maskstr = " {0} " # no need
oldword = 0
print(" w0 = _mm256_lddqu_si256 (compressed);")
for j in range(howmany(bit)/8):
firstword = j * bit / 32
secondword = (j * bit + bit - 1)/32
if(secondword > oldword):
print(" w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
oldword = secondword
firstshift = (j*bit) % 32
firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
if(firstshift == 0):
firstshiftstr =" w{0} " # no need
wfirst = firstshiftstr.format(firstword%2)
if( firstword == secondword):
if(firstshift + bit <> 32):
wfirst = maskstr.format(wfirst)
print(" _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
else:
secondshift = (32-firstshift)
wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
wfirstorsecond = maskstr.format(wfirstorsecond)
print(" _mm256_storeu_si256(out + {0},\n {1});".format(j,wfirstorsecond))
print("}");
print("")
print("static avxpackblockfnc avxfuncPackArr[] = {")
for bit in range(0,32):
print("&avxpackblock{0},".format(bit))
print("&avxpackblock32")
print("};")
print("static avxpackblockfnc avxfuncPackMaskArr[] = {")
for bit in range(0,32):
print("&avxpackblockmask{0},".format(bit))
print("&avxpackblockmask32")
print("};")
print("static avxunpackblockfnc avxfuncUnpackArr[] = {")
for bit in range(0,32):
print("&avxunpackblock{0},".format(bit))
print("&avxunpackblock32")
print("};")
print("/** code generated by avxpacking.py ends here **/")

152
cpp/simdcomp/scripts/simdfor.py Executable file
View File

@@ -0,0 +1,152 @@
#!/usr/bin/env python3
from math import ceil
print("""
/**
* Blablabla
*
*/
""");
def mask(bit):
return str((1 << bit) - 1)
for length in [32]:
print("""
static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) {
__m128i *out = (__m128i*)(_out);
int i;
(void) _in;
for (i = 0; i < 8; ++i) {
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
}
return initOffset;
}
""")
print("""
static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) {
(void) initOffset;
(void) _in;
(void) out;
}
""")
for bit in range(1,33):
offsetVar = " initOffset";
print("""
static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t * _in, __m128i * out) {
const __m128i *in = (const __m128i*)(_in);
__m128i OutReg;
""");
if (bit != 32):
print(" __m128i CurrIn = _mm_load_si128(in);");
print(" __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);");
else:
print(" __m128i InReg = _mm_load_si128(in);");
print(" (void) initOffset;");
inwordpointer = 0
valuecounter = 0
for k in range(ceil((length * bit) / 32)):
if(valuecounter == length): break
for x in range(inwordpointer,32,bit):
if(x!=0) :
print(" OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));");
else:
print(" OutReg = InReg; ");
if((x+bit>=32) ):
while(inwordpointer<32):
inwordpointer += bit
print(" _mm_store_si128(out, OutReg);");
print("");
if(valuecounter + 1 < length):
print(" ++out;")
inwordpointer -= 32;
if(inwordpointer>0):
print(" OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");");
if(valuecounter + 1 < length):
print(" ++in;")
if (bit != 32):
print(" CurrIn = _mm_load_si128(in);");
print(" InReg = _mm_sub_epi32(CurrIn, initOffset);");
else:
print(" InReg = _mm_load_si128(in);");
print("");
valuecounter = valuecounter + 1
if(valuecounter == length): break
assert(valuecounter == length)
print("\n}\n\n""")
for bit in range(1,32):
offsetVar = " initOffset";
print("""\n
static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const __m128i* in, uint32_t * _out) {
""");
print(""" __m128i* out = (__m128i*)(_out);
__m128i InReg = _mm_load_si128(in);
__m128i OutReg;
__m128i tmp;
const __m128i mask = _mm_set1_epi32((1U<<"""+str(bit)+""")-1);
""");
MainText = "";
MainText += "\n";
inwordpointer = 0
valuecounter = 0
for k in range(ceil((length * bit) / 32)):
for x in range(inwordpointer,32,bit):
if(valuecounter == length): break
if (x > 0):
MainText += " tmp = _mm_srli_epi32(InReg," + str(x) +");\n";
else:
MainText += " tmp = InReg;\n";
if(x+bit<32):
MainText += " OutReg = _mm_and_si128(tmp, mask);\n";
else:
MainText += " OutReg = tmp;\n";
if((x+bit>=32) ):
while(inwordpointer<32):
inwordpointer += bit
if(valuecounter + 1 < length):
MainText += " ++in;"
MainText += " InReg = _mm_load_si128(in);\n";
inwordpointer -= 32;
if(inwordpointer>0):
MainText += " OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n";
if (bit != 32):
MainText += " OutReg = _mm_add_epi32(OutReg, initOffset);\n";
MainText += " _mm_store_si128(out++, OutReg);\n\n";
MainText += "";
valuecounter = valuecounter + 1
if(valuecounter == length): break
assert(valuecounter == length)
print(MainText)
print(" return initOffset;");
print("\n}\n\n")
print("""
static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) {
__m128i * mout = (__m128i *)_out;
__m128i invec;
size_t k;
for(k = 0; k < 128/4; ++k) {
invec = _mm_load_si128(in++);
_mm_store_si128(mout++, invec);
}
return invec;
}
""")

40
cpp/simdcomp/simdcomp.def Normal file
View File

@@ -0,0 +1,40 @@
EXPORTS
simdpack
simdpackwithoutmask
simdunpack
bits
maxbits
maxbits_length
simdmin
simdmin_length
simdmaxmin
simdmaxmin_length
simdmaxbitsd1
simdmaxbitsd1_length
simdpackd1
simdpackwithoutmaskd1
simdunpackd1
simdsearchd1
simdsearchwithlengthd1
simdselectd1
simdpackFOR
simdselectFOR
simdsearchwithlengthFOR
simdunpackFOR
simdmin_length
simdmaxmin
simdmaxmin_length
simdpack_length
simdpackFOR_length
simdunpackFOR_length
simdpack_shortlength
simdfastsetFOR
simdfastset
simdfastsetd1
simdunpack_length
simdunpack_shortlength
simdsearchwithlengthFOR
simdscand1
simdfastsetd1fromprevious
simdfastsetd1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,234 @@
/**
* This code is released under a BSD License.
*/
#include "simdcomputil.h"
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#include <assert.h>
#define Delta(curr, prev) \
_mm_sub_epi32(curr, \
_mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12)))
/* returns the integer logarithm of v (bit width) */
uint32_t bits(const uint32_t v) {
#ifdef _MSC_VER
unsigned long answer;
if (v == 0) {
return 0;
}
_BitScanReverse(&answer, v);
return answer + 1;
#else
return v == 0 ? 0 : 32 - __builtin_clz(v); /* assume GCC-like compiler if not microsoft */
#endif
}
static uint32_t maxbitas32int(const __m128i accumulator) {
const __m128i _tmp1 = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
const __m128i _tmp2 = _mm_or_si128(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/
uint32_t ans = _mm_cvtsi128_si32(_tmp2);
return bits(ans);
}
SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) {
const __m128i* pin = (const __m128i*)(begin);
__m128i accumulator = _mm_loadu_si128(pin);
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_or_si128(accumulator,newvec);
}
return maxbitas32int(accumulator);
}
static uint32_t orasint(const __m128i accumulator) {
const __m128i _tmp1 = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
const __m128i _tmp2 = _mm_or_si128(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/
return _mm_cvtsi128_si32(_tmp2);
}
#ifdef __SSE4_1__
static uint32_t minasint(const __m128i accumulator) {
const __m128i _tmp1 = _mm_min_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
const __m128i _tmp2 = _mm_min_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/
return _mm_cvtsi128_si32(_tmp2);
}
static uint32_t maxasint(const __m128i accumulator) {
const __m128i _tmp1 = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
const __m128i _tmp2 = _mm_max_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/
return _mm_cvtsi128_si32(_tmp2);
}
uint32_t simdmin(const uint32_t * in) {
const __m128i* pin = (const __m128i*)(in);
__m128i accumulator = _mm_loadu_si128(pin);
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_min_epu32(accumulator,newvec);
}
return minasint(accumulator);
}
void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) {
const __m128i* pin = (const __m128i*)(in);
__m128i minaccumulator = _mm_loadu_si128(pin);
__m128i maxaccumulator = minaccumulator;
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
minaccumulator = _mm_min_epu32(minaccumulator,newvec);
maxaccumulator = _mm_max_epu32(maxaccumulator,newvec);
}
*getmin = minasint(minaccumulator);
*getmax = maxasint(maxaccumulator);
}
uint32_t simdmin_length(const uint32_t * in, uint32_t length) {
uint32_t currentmin = 0xFFFFFFFF;
uint32_t lengthdividedby4 = length / 4;
uint32_t offset = lengthdividedby4 * 4;
uint32_t k;
if (lengthdividedby4 > 0) {
const __m128i* pin = (const __m128i*)(in);
__m128i accumulator = _mm_loadu_si128(pin);
k = 1;
for(; 4*k < lengthdividedby4 * 4; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_min_epu32(accumulator,newvec);
}
currentmin = minasint(accumulator);
}
for (k = offset; k < length; ++k)
if (in[k] < currentmin)
currentmin = in[k];
return currentmin;
}
void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) {
uint32_t lengthdividedby4 = length / 4;
uint32_t offset = lengthdividedby4 * 4;
uint32_t k;
*getmin = 0xFFFFFFFF;
*getmax = 0;
if (lengthdividedby4 > 0) {
const __m128i* pin = (const __m128i*)(in);
__m128i minaccumulator = _mm_loadu_si128(pin);
__m128i maxaccumulator = minaccumulator;
k = 1;
for(; 4*k < lengthdividedby4 * 4; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
minaccumulator = _mm_min_epu32(minaccumulator,newvec);
maxaccumulator = _mm_max_epu32(maxaccumulator,newvec);
}
*getmin = minasint(minaccumulator);
*getmax = maxasint(maxaccumulator);
}
for (k = offset; k < length; ++k) {
if (in[k] < *getmin)
*getmin = in[k];
if (in[k] > *getmax)
*getmax = in[k];
}
}
#endif
SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t * in,uint32_t length) {
uint32_t k;
uint32_t lengthdividedby4 = length / 4;
uint32_t offset = lengthdividedby4 * 4;
uint32_t bigxor = 0;
if(lengthdividedby4 > 0) {
const __m128i* pin = (const __m128i*)(in);
__m128i accumulator = _mm_loadu_si128(pin);
k = 1;
for(; 4*k < 4*lengthdividedby4; ++k) {
__m128i newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_or_si128(accumulator,newvec);
}
bigxor = orasint(accumulator);
}
for(k = offset; k < length; ++k)
bigxor |= in[k];
return bits(bigxor);
}
/* maxbit over 128 integers (SIMDBlockSize) with provided initial value */
uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) {
__m128i initoffset = _mm_set1_epi32 (initvalue);
const __m128i* pin = (const __m128i*)(in);
__m128i newvec = _mm_loadu_si128(pin);
__m128i accumulator = Delta(newvec , initoffset);
__m128i oldvec = newvec;
uint32_t k = 1;
for(; 4*k < SIMDBlockSize; ++k) {
newvec = _mm_loadu_si128(pin+k);
accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec));
oldvec = newvec;
}
initoffset = oldvec;
return maxbitas32int(accumulator);
}
/* maxbit over |length| integers with provided initial value */
uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
uint32_t length) {
__m128i newvec;
__m128i oldvec;
__m128i initoffset;
__m128i accumulator;
const __m128i *pin;
uint32_t tmparray[4];
uint32_t k = 1;
uint32_t acc;
assert(length > 0);
pin = (const __m128i *)(in);
initoffset = _mm_set1_epi32(initvalue);
switch (length) {
case 1:
newvec = _mm_set1_epi32(in[0]);
break;
case 2:
newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
break;
case 3:
newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
break;
default:
newvec = _mm_loadu_si128(pin);
break;
}
accumulator = Delta(newvec, initoffset);
oldvec = newvec;
/* process 4 integers and build an accumulator */
while (k * 4 + 4 <= length) {
newvec = _mm_loadu_si128(pin + k);
accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
oldvec = newvec;
k++;
}
/* extract the accumulator as an integer */
_mm_storeu_si128((__m128i *)(tmparray), accumulator);
acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3];
/* now process the remaining integers */
for (k *= 4; k < length; k++)
acc |= in[k] - (k == 0 ? initvalue : in[k - 1]);
/* return the number of bits */
return bits(acc);
}

14501
cpp/simdcomp/src/simdfor.c Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

900
cpp/simdcomp/tests/unit.c Normal file
View File

@@ -0,0 +1,900 @@
/**
* This code is released under a BSD License.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "simdcomp.h"
int testshortpack() {
int bit;
size_t i;
size_t length;
__m128i * bb;
srand(0);
printf("testshortpack\n");
for (bit = 0; bit < 32; ++bit) {
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
for (i = 0; i < N; ++i) {
data[i] = rand() & ((1 << bit) - 1);
}
for (length = 0; length <= N; ++length) {
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
bb = simdpack_shortlength(data, length, (__m128i *) buffer,
bit);
if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
printf("bug\n");
return -1;
}
simdunpack_shortlength((__m128i *) buffer, length,
backdata, bit);
for (i = 0; i < length; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
}
free(data);
free(backdata);
free(buffer);
}
return 0;
}
int testlongpack() {
int bit;
size_t i;
size_t length;
__m128i * bb;
srand(0);
printf("testlongpack\n");
for (bit = 0; bit < 32; ++bit) {
const size_t N = 2048;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
for (i = 0; i < N; ++i) {
data[i] = rand() & ((1 << bit) - 1);
}
for (length = 0; length <= N; ++length) {
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
bb = simdpack_length(data, length, (__m128i *) buffer,
bit);
if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
printf("bug\n");
return -1;
}
simdunpack_length((__m128i *) buffer, length,
backdata, bit);
for (i = 0; i < length; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
}
free(data);
free(backdata);
free(buffer);
}
return 0;
}
int testset() {
int bit;
size_t i;
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
srand(0);
for (bit = 0; bit < 32; ++bit) {
printf("simple set %d \n",bit);
for (i = 0; i < N; ++i) {
data[i] = rand() & ((1 << bit) - 1);
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
simdpack(data, (__m128i *) buffer, bit);
simdunpack((__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
for(i = N ; i > 0; i--) {
simdfastset((__m128i *) buffer, bit, data[N - i], i - 1);
}
simdunpack((__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[N - i - 1]) {
printf("bug\n");
return -1;
}
}
simdpack(data, (__m128i *) buffer, bit);
for(i = 1 ; i <= N; i++) {
simdfastset((__m128i *) buffer, bit, data[i - 1], i - 1);
}
simdunpack((__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
return -1;
}
}
}
free(data);
free(backdata);
free(buffer);
return 0;
}
#ifdef __SSE4_1__
int testsetd1() {
int bit;
size_t i;
uint32_t newvalue;
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
srand(0);
for (bit = 0; bit < 32; ++bit) {
printf("simple set d1 %d \n",bit);
data[0] = rand() & ((1 << bit) - 1);
datazeroes[0] = 0;
for (i = 1; i < N; ++i) {
data[i] = data[i - 1] + (rand() & ((1 << bit) - 1));
datazeroes[i] = 0;
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
simdpackd1(0,datazeroes, (__m128i *) buffer, bit);
for(i = 1 ; i <= N; i++) {
simdfastsetd1(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
newvalue = simdselectd1(0, (const __m128i *) buffer, bit,i - 1);
if( newvalue != data[i-1] ) {
printf("bad set-select\n");
return -1;
}
}
simdunpackd1(0,(__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i])
return -1;
}
}
free(data);
free(backdata);
free(buffer);
free(datazeroes);
return 0;
}
#endif
int testsetFOR() {
int bit;
size_t i;
uint32_t newvalue;
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
srand(0);
for (bit = 0; bit < 32; ++bit) {
printf("simple set FOR %d \n",bit);
for (i = 0; i < N; ++i) {
data[i] = (rand() & ((1 << bit) - 1));
datazeroes[i] = 0;
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
simdpackFOR(0,datazeroes, (__m128i *) buffer, bit);
for(i = 1 ; i <= N; i++) {
simdfastsetFOR(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
newvalue = simdselectFOR(0, (const __m128i *) buffer, bit,i - 1);
if( newvalue != data[i-1] ) {
printf("bad set-select\n");
return -1;
}
}
simdunpackFOR(0,(__m128i *) buffer, backdata, bit);
for (i = 0; i < N; ++i) {
if (data[i] != backdata[i])
return -1;
}
}
free(data);
free(backdata);
free(buffer);
free(datazeroes);
return 0;
}
int testshortFORpack() {
int bit;
size_t i;
__m128i * rb;
size_t length;
uint32_t offset = 7;
srand(0);
for (bit = 0; bit < 32; ++bit) {
const size_t N = 128;
uint32_t * data = malloc(N * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t));
uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
for (i = 0; i < N; ++i) {
data[i] = (rand() & ((1 << bit) - 1)) + offset;
}
for (length = 0; length <= N; ++length) {
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
rb = simdpackFOR_length(offset,data, length, (__m128i *) buffer,
bit);
if(((rb - (__m128i *) buffer)*sizeof(__m128i)) != (unsigned) simdpackFOR_compressedbytes(length,bit)) {
return -1;
}
simdunpackFOR_length(offset,(__m128i *) buffer, length,
backdata, bit);
for (i = 0; i < length; ++i) {
if (data[i] != backdata[i])
return -1;
}
}
free(data);
free(backdata);
free(buffer);
}
return 0;
}
#ifdef __AVX2__
int testbabyavx() {
int bit;
int trial;
unsigned int i,j;
const size_t N = AVXBlockSize;
srand(0);
printf("testbabyavx\n");
printf("bit = ");
for (bit = 0; bit < 32; ++bit) {
printf(" %d ",bit);
fflush(stdout);
for(trial = 0; trial < 100; ++trial) {
uint32_t * data = malloc(N * sizeof(uint32_t)+ 64 * sizeof(uint32_t));
uint32_t * backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t) );
__m256i * buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32);
for (i = 0; i < N; ++i) {
data[i] = rand() & ((uint32_t)(1 << bit) - 1);
}
for (i = 0; i < N; ++i) {
backdata[i] = 0;
}
if(avxmaxbits(data) != maxbits_length(data,N)) {
printf("avxmaxbits is buggy\n");
return -1;
}
avxpackwithoutmask(data, buffer, bit);
avxunpack(buffer, backdata, bit);
for (i = 0; i < AVXBlockSize; ++i) {
if (data[i] != backdata[i]) {
printf("bug\n");
for (j = 0; j < N; ++j) {
if (data[j] != backdata[j]) {
printf("data[%d]=%d v.s. backdata[%d]=%d\n",j,data[j],j,backdata[j]);
} else {
printf("data[%d]=%d\n",j,data[j]);
}
}
return -1;
}
}
free(data);
free(backdata);
free(buffer);
}
}
printf("\n");
return 0;
}
int testavx2() {
int N = 5000 * AVXBlockSize, gap;
__m256i * buffer = malloc(AVXBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(AVXBlockSize * sizeof(uint32_t));
for (gap = 1; gap <= 387420489; gap *= 3) {
int k;
printf(" gap = %u \n", gap);
for (k = 0; k < N; ++k)
datain[k] = k * gap;
for (k = 0; k * AVXBlockSize < N; ++k) {
/*
First part works for general arrays (sorted or unsorted)
*/
int j;
/* we compute the bit width */
const uint32_t b = avxmaxbits(datain + k * AVXBlockSize);
if(avxmaxbits(datain + k * AVXBlockSize) != maxbits_length(datain + k * AVXBlockSize,AVXBlockSize)) {
printf("avxmaxbits is buggy %d %d \n",
avxmaxbits(datain + k * AVXBlockSize),
maxbits_length(datain + k * AVXBlockSize,AVXBlockSize));
return -1;
}
printf("bit width = %d\n",b);
/* we read 256 integers at "datain + k * AVXBlockSize" and
write b 256-bit vectors at "buffer" */
avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b);
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
avxunpack(buffer, backbuffer, b);/* uncompressed */
for (j = 0; j < AVXBlockSize; ++j) {
if (backbuffer[j] != datain[k * AVXBlockSize + j]) {
int i;
printf("bug in avxpack\n");
for(i = 0; i < AVXBlockSize; ++i) {
printf("data[%d]=%d got back %d %s\n",i,
datain[k * AVXBlockSize + i],backbuffer[i],
datain[k * AVXBlockSize + i]!=backbuffer[i]?"bug":"");
}
return -2;
}
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}
#endif /* avx2 */
int test() {
int N = 5000 * SIMDBlockSize, gap;
__m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
for (gap = 1; gap <= 387420489; gap *= 3) {
int k;
printf(" gap = %u \n", gap);
for (k = 0; k < N; ++k)
datain[k] = k * gap;
for (k = 0; k * SIMDBlockSize < N; ++k) {
/*
First part works for general arrays (sorted or unsorted)
*/
int j;
/* we compute the bit width */
const uint32_t b = maxbits(datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b 128-bit vectors at "buffer" */
simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpack(buffer, backbuffer, b);/* uncompressed */
for (j = 0; j < SIMDBlockSize; ++j) {
if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
printf("bug in simdpack\n");
return -2;
}
}
{
/*
next part assumes that the data is sorted (uses differential coding)
*/
uint32_t offset = 0;
/* we compute the bit width */
const uint32_t b1 = simdmaxbitsd1(offset,
datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b1 128-bit vectors at "buffer" */
simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer,
b1);
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpackd1(offset, buffer, backbuffer, b1);
for (j = 0; j < SIMDBlockSize; ++j) {
if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
printf("bug in simdpack d1\n");
return -3;
}
}
offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}
#ifdef __SSE4_1__
int testFOR() {
int N = 5000 * SIMDBlockSize, gap;
__m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t tmax, tmin, tb;
for (gap = 1; gap <= 387420489; gap *= 2) {
int k;
printf(" gap = %u \n", gap);
for (k = 0; k < N; ++k)
datain[k] = k * gap;
for (k = 0; k * SIMDBlockSize < N; ++k) {
int j;
simdmaxmin_length(datain + k * SIMDBlockSize,SIMDBlockSize,&tmin,&tmax);
/* we compute the bit width */
tb = bits(tmax - tmin);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b 128-bit vectors at "buffer" */
simdpackFOR(tmin,datain + k * SIMDBlockSize, buffer, tb);
for (j = 0; j < SIMDBlockSize; ++j) {
uint32_t selectedvalue = simdselectFOR(tmin,buffer,tb,j);
if (selectedvalue != datain[k * SIMDBlockSize + j]) {
printf("bug in simdselectFOR\n");
return -3;
}
}
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpackFOR(tmin,buffer, backbuffer, tb);/* uncompressed */
for (j = 0; j < SIMDBlockSize; ++j) {
if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
printf("bug in simdpackFOR\n");
return -2;
}
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}
#endif
#define MAX 300
int test_simdmaxbitsd1_length() {
uint32_t result, buffer[MAX + 1];
int i, j;
memset(&buffer[0], 0xff, sizeof(buffer));
/* this test creates buffers of different length; each buffer is
* initialized to result in the following deltas:
* length 1: 2
* length 2: 1 2
* length 3: 1 1 2
* length 4: 1 1 1 2
* length 5: 1 1 1 1 2
* etc. Each sequence's "maxbits" is 2. */
for (i = 0; i < MAX; i++) {
for (j = 0; j < i; j++)
buffer[j] = j + 1;
buffer[i] = i + 2;
result = simdmaxbitsd1_length(0, &buffer[0], i + 1);
if (result != 2) {
printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n",
result, i);
return -1;
}
}
printf("simdmaxbitsd1_length: ok\n");
return 0;
}
int uint32_cmp(const void *a, const void *b)
{
const uint32_t *ia = (const uint32_t *)a;
const uint32_t *ib = (const uint32_t *)b;
if(*ia < *ib)
return -1;
else if (*ia > *ib)
return 1;
return 0;
}
#ifdef __SSE4_1__
int test_simdpackedsearch() {
uint32_t buffer[128];
uint32_t result = 0;
int b, i;
uint32_t init = 0;
__m128i initial = _mm_set1_epi32(init);
/* initialize the buffer */
for (i = 0; i < 128; i++)
buffer[i] = (uint32_t)(i + 1);
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 1; b <= 32; b++) {
uint32_t out[128];
/* delta-encode to 'i' bits */
simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
initial = _mm_setzero_si128();
printf("simdsearchd1: %d bits\n", b);
/* now perform the searches */
initial = _mm_set1_epi32(init);
assert(simdsearchd1(&initial, (__m128i *)out, b, 0, &result) == 0);
assert(result > 0);
for (i = 1; i <= 128; i++) {
initial = _mm_set1_epi32(init);
assert(simdsearchd1(&initial, (__m128i *)out, b,
(uint32_t)i, &result) == i - 1);
assert(result == (unsigned)i);
}
initial = _mm_set1_epi32(init);
assert(simdsearchd1(&initial, (__m128i *)out, b, 200, &result)
== 128);
assert(result > 200);
}
printf("simdsearchd1: ok\n");
return 0;
}
int test_simdpackedsearchFOR() {
uint32_t buffer[128];
uint32_t result = 0;
int b;
uint32_t i;
uint32_t maxv, tmin, tmax, tb;
uint32_t out[128];
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 1; b <= 32; b++) {
/* initialize the buffer */
maxv = (b == 32)
? 0xFFFFFFFF
: ((1U<<b) - 1);
for (i = 0; i < 128; i++)
buffer[i] = maxv * (i + 1) / 128;
simdmaxmin_length(buffer,SIMDBlockSize,&tmin,&tmax);
/* we compute the bit width */
tb = bits(tmax - tmin);
/* delta-encode to 'i' bits */
simdpackFOR(tmin, buffer, (__m128i *)out, tb);
printf("simdsearchd1: %d bits\n", b);
/* now perform the searches */
for (i = 0; i < 128; i++) {
assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb,i));
}
for (i = 0; i < 128; i++) {
int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb,
128,buffer[i], &result) ;
assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == buffer[x]);
assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == result);
assert(buffer[x] == result);
assert(result == buffer[i]);
assert(buffer[x] == buffer[i]);
}
}
printf("simdsearchFOR: ok\n");
return 0;
}
int test_simdpackedsearch_advanced() {
uint32_t buffer[128];
uint32_t backbuffer[128];
uint32_t out[128];
uint32_t result = 0;
uint32_t b, i;
uint32_t init = 0;
__m128i initial = _mm_set1_epi32(init);
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 0; b <= 32; b++) {
uint32_t prev = init;
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)(1431655765 * i + 0xFFFFFFFF)) ;
if(b < 32) buffer[i] %= (1<<b);
}
qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(init, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'i' bits */
simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
simdunpackd1(init, (__m128i *)out, backbuffer, b);
for (i = 0; i < 128; i++) {
assert(buffer[i] == backbuffer[i]);
}
printf("advanced simdsearchd1: %d bits\n", b);
for (i = 0; i < 128; i++) {
int pos;
initial = _mm_set1_epi32(init);
pos = simdsearchd1(&initial, (__m128i *)out, b,
buffer[i], &result);
assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
buffer[i], &result));
assert(buffer[pos] == buffer[i]);
if(pos > 0)
assert(buffer[pos - 1] < buffer[i]);
assert(result == buffer[i]);
}
for (i = 0; i < 128; i++) {
int pos;
if(buffer[i] == 0) continue;
initial = _mm_set1_epi32(init);
pos = simdsearchd1(&initial, (__m128i *)out, b,
buffer[i] - 1, &result);
assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
buffer[i] - 1, &result));
assert(buffer[pos] >= buffer[i] - 1);
if(pos > 0)
assert(buffer[pos - 1] < buffer[i] - 1);
assert(result == buffer[pos]);
}
for (i = 0; i < 128; i++) {
int pos;
if (buffer[i] + 1 == 0)
continue;
initial = _mm_set1_epi32(init);
pos = simdsearchd1(&initial, (__m128i *) out, b,
buffer[i] + 1, &result);
assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
buffer[i] + 1, &result));
if(pos == 128) {
assert(buffer[i] == buffer[127]);
} else {
assert(buffer[pos] >= buffer[i] + 1);
if (pos > 0)
assert(buffer[pos - 1] < buffer[i] + 1);
assert(result == buffer[pos]);
}
}
}
printf("advanced simdsearchd1: ok\n");
return 0;
}
int test_simdpackedselect() {
uint32_t buffer[128];
uint32_t initial = 33;
int b, i;
/* initialize the buffer */
for (i = 0; i < 128; i++)
buffer[i] = (uint32_t)(initial + i);
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 1; b <= 32; b++) {
uint32_t out[128];
/* delta-encode to 'i' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
printf("simdselectd1: %d bits\n", b);
/* now perform the searches */
for (i = 0; i < 128; i++) {
assert(simdselectd1(initial, (__m128i *)out, b, (uint32_t)i)
== initial + i);
}
}
printf("simdselectd1: ok\n");
return 0;
}
int test_simdpackedselect_advanced() {
uint32_t buffer[128];
uint32_t initial = 33;
uint32_t b;
int i;
/* this test creates delta encoded buffers with different bits, then
* performs lower bound searches for each key */
for (b = 0; b <= 32; b++) {
uint32_t prev = initial;
uint32_t out[128];
/* initialize the buffer */
for (i = 0; i < 128; i++) {
buffer[i] = ((uint32_t)(165576 * i)) ;
if(b < 32) buffer[i] %= (1<<b);
}
for (i = 0; i < 128; i++) {
buffer[i] = buffer[i] + prev;
prev = buffer[i];
}
for (i = 1; i < 128; i++) {
if(buffer[i] < buffer[i-1] )
buffer[i] = buffer[i-1];
}
assert(simdmaxbitsd1(initial, buffer)<=b);
for (i = 0; i < 128; i++) {
out[i] = 0; /* memset would do too */
}
/* delta-encode to 'i' bits */
simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
printf("simdselectd1: %d bits\n", b);
/* now perform the searches */
for (i = 0; i < 128; i++) {
uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i);
assert(valretrieved == buffer[i]);
}
}
printf("advanced simdselectd1: ok\n");
return 0;
}
#endif
int main() {
int r;
r = testsetFOR();
if (r) {
printf("test failure 1\n");
return r;
}
#ifdef __SSE4_1__
r = testsetd1();
if (r) {
printf("test failure 2\n");
return r;
}
#endif
r = testset();
if (r) {
printf("test failure 3\n");
return r;
}
r = testshortFORpack();
if (r) {
printf("test failure 4\n");
return r;
}
r = testshortpack();
if (r) {
printf("test failure 5\n");
return r;
}
r = testlongpack();
if (r) {
printf("test failure 6\n");
return r;
}
#ifdef __SSE4_1__
r = test_simdpackedsearchFOR();
if (r) {
printf("test failure 7\n");
return r;
}
r = testFOR();
if (r) {
printf("test failure 8\n");
return r;
}
#endif
#ifdef __AVX2__
r= testbabyavx();
if (r) {
printf("test failure baby avx\n");
return r;
}
r = testavx2();
if (r) {
printf("test failure 9 avx\n");
return r;
}
#endif
r = test();
if (r) {
printf("test failure 9\n");
return r;
}
r = test_simdmaxbitsd1_length();
if (r) {
printf("test failure 10\n");
return r;
}
#ifdef __SSE4_1__
r = test_simdpackedsearch();
if (r) {
printf("test failure 11\n");
return r;
}
r = test_simdpackedsearch_advanced();
if (r) {
printf("test failure 12\n");
return r;
}
r = test_simdpackedselect();
if (r) {
printf("test failure 13\n");
return r;
}
r = test_simdpackedselect_advanced();
if (r) {
printf("test failure 14\n");
return r;
}
#endif
printf("All tests OK!\n");
return 0;
}

View File

@@ -0,0 +1,102 @@
/**
* This code is released under a BSD License.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "simdcomp.h"
#define get_random_char() (uint8_t)(rand() % 256);
int main() {
int N = 5000 * SIMDBlockSize, gap;
__m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
srand(time(NULL));
for (gap = 1; gap <= 387420489; gap *= 3) {
int k;
printf(" gap = %u \n", gap);
/* simulate some random character string, don't care about endiannes */
for (k = 0; k < N; ++k) {
uint8_t _tmp[4];
_tmp[0] = get_random_char();
_tmp[1] = get_random_char();
_tmp[2] = get_random_char();
_tmp[3] = get_random_char();
memmove(&datain[k], _tmp, 4);
}
for (k = 0; k * SIMDBlockSize < N; ++k) {
/*
First part works for general arrays (sorted or unsorted)
*/
int j;
/* we compute the bit width */
const uint32_t b = maxbits(datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b 128-bit vectors at "buffer" */
simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpack(buffer, backbuffer, b);/* uncompressed */
for (j = 0; j < SIMDBlockSize; ++j) {
uint8_t chars_back[4];
uint8_t chars_in[4];
memmove(chars_back, &backbuffer[j], 4);
memmove(chars_in, &datain[k * SIMDBlockSize + j], 4);
if (chars_in[0] != chars_back[0]
|| chars_in[1] != chars_back[1]
|| chars_in[2] != chars_back[2]
|| chars_in[3] != chars_back[3]) {
printf("bug in simdpack\n");
return -2;
}
}
{
/*
next part assumes that the data is sorted (uses differential coding)
*/
uint32_t offset = 0;
/* we compute the bit width */
const uint32_t b1 = simdmaxbitsd1(offset,
datain + k * SIMDBlockSize);
/* we read 128 integers at "datain + k * SIMDBlockSize" and
write b1 128-bit vectors at "buffer" */
simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer,
b1);
/* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
simdunpackd1(offset, buffer, backbuffer, b1);
for (j = 0; j < SIMDBlockSize; ++j) {
uint8_t chars_back[4];
uint8_t chars_in[4];
memmove(chars_back, &backbuffer[j], 4);
memmove(chars_in, &datain[k * SIMDBlockSize + j], 4);
if (chars_in[0] != chars_back[0]
|| chars_in[1] != chars_back[1]
|| chars_in[2] != chars_back[2]
|| chars_in[3] != chars_back[3]) {
printf("bug in simdpack\n");
return -3;
}
}
offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
}
}
}
free(buffer);
free(datain);
free(backbuffer);
printf("Code looks good.\n");
return 0;
}