Merge commit 'f07ccd6e4fbc5bbfeb94d40e0f14bc527a7d5439' as 'cpp/simdcomp'

2026-05-20 10:10:42 +00:00 · 2017-01-26 20:28:23 +01:00
parent f19f8757de f07ccd6e4f
commit 9f02b090dd
32 changed files with 95877 additions and 0 deletions
--- a/cpp/simdcomp/.gitignore
+++ b/cpp/simdcomp/.gitignore
@@ -0,0 +1,9 @@
+Makefile.in
+lib*
+unit*
+*.o
+src/*.lo
+src/*.o
+src/.deps
+src/.dirstamp
+src/.libs
--- a/cpp/simdcomp/.travis.yml
+++ b/cpp/simdcomp/.travis.yml
@@ -0,0 +1,11 @@
+language: c
+sudo: false
+compiler:
+  - gcc
+  - clang
+
+branches:
+  only:
+    - master
+
+script: make && ./unit
--- a/cpp/simdcomp/CHANGELOG
+++ b/cpp/simdcomp/CHANGELOG
@@ -0,0 +1,9 @@
+Upcoming
+  - added missing include
+  - improved portability (MSVC)
+  - implemented C89 compatibility
+Version 0.0.3 (19 May 2014)
+  - improved documentation
+Version 0.0.2 (6 February 2014)
+  - added go demo
+Version 0.0.1  (5 February 2014)
--- a/cpp/simdcomp/LICENSE
+++ b/cpp/simdcomp/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2014--, The authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+* Neither the name of the {organization} nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/cpp/simdcomp/README.md
+++ b/cpp/simdcomp/README.md
@@ -0,0 +1,137 @@
+The SIMDComp library
+====================
+[![Build Status](https://travis-ci.org/lemire/simdcomp.png)](https://travis-ci.org/lemire/simdcomp)
+
+A simple C library for compressing lists of integers using binary packing and SIMD instructions.
+The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers.
+
+This library can decode at least 4 billion compressed integers per second on most
+desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s.
+This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.
+
+On a Skylake Intel processor, it can decode integers at a rate of 0.3 cycles per integer,
+which can easily translate into more than 8 billion decoded integers per second.
+
+Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others
+
+What is it for?
+-------------
+
+This is a low-level library for fast integer compression. By design it does not define a compressed
+format. It is up to the (sophisticated) user to create a compressed format.
+
+Requirements
+-------------
+
+- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.)
+- It is possible to build the core part of the code if your processor supports SSE2 (Pentium4 or better)
+- C99 compliant compiler (GCC is assumed)
+- A Linux-like distribution is assumed by the makefile
+
+For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker
+
+Usage
+-------
+
+Compression works over blocks of 128 integers.
+
+For a complete working example, see example.c (you can build it and
+run it with "make example; ./example").
+
+
+
+1) Lists of integers in random order.
+
+```C            
+const uint32_t b = maxbits(datain);// computes bit width
+simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b 128-bit words (b*16 bytes)
+simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
+```
+
+While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b.
+
+2) Sorted lists of integers.
+
+We use differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset).
+
+```C            
+uint32_t offset = 0;
+uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
+simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1 128-bit words (b1*16 bytes)
+simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
+```
+
+General example for arrays of arbitrary length:
+```C
+int compress_decompress_demo() {
+  size_t k, N = 9999;
+  __m128i * endofbuf;
+  uint32_t * datain = malloc(N * sizeof(uint32_t));
+  uint8_t * buffer;
+  uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
+  uint32_t b;
+
+  for (k = 0; k < N; ++k){        /* start with k=0, not k=1! */
+    datain[k] = k;
+  }
+
+  b = maxbits_length(datain, N);
+  buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory
+  endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
+  /* compressed data is stored between buffer and endofbuf using (endofbuf-(__m128i *)buffer)*sizeof(__m128i) bytes */
+  /* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */
+  simdunpack_length((const __m128i *)buffer, N, backbuffer, b);
+
+  for (k = 0; k < N; ++k){
+    if(datain[k] != backbuffer[k]) {
+      printf("bug\n");
+      return -1;
+    }
+  }
+  return 0;
+}
+```
+
+
+3) Frame-of-Reference 
+
+We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing
+routines, but do not use differential coding so they allow faster search in some cases, at the expense
+of compression.
+
+Setup
+---------
+
+
+make
+make test
+
+and if you are daring:
+
+make install
+
+Go
+--------
+
+If you are a go user, there is a "go" folder where you will find a simple demo.
+
+Other libraries
+----------------
+
+* Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
+* Fast integer compression in C using StreamVByte https://github.com/lemire/streamvbyte
+* FastPFOR is a C++ research library well suited to compress unsorted arrays: https://github.com/lemire/FastPFor
+* SIMDCompressionAndIntersection is a C++ research library well suited for sorted arrays (differential coding)
+and computing intersections: https://github.com/lemire/SIMDCompressionAndIntersection
+* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
+* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
+
+
+References
+------------
+
+* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399
+* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015.  http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
+* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
+* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916
+* T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5
--- a/cpp/simdcomp/benchmarks/benchmark.c
+++ b/cpp/simdcomp/benchmarks/benchmark.c
@@ -0,0 +1,235 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "simdcomp.h"
+
+#ifdef _MSC_VER
+# include <windows.h>
+
+__int64 freq;
+
+typedef __int64 time_snap_t;
+
+/* MSVC timer: reads the performance counter and scales it by 1000000/freq.
+ * freq is the file-scope value that main() fills in through
+ * QueryPerformanceFrequency before any benchmark runs. */
+static time_snap_t time_snap(void)
+{
+	__int64 ticks;
+	__int64 scaled;
+
+	QueryPerformanceCounter((LARGE_INTEGER *)&ticks);
+	scaled = ticks * 1000000;
+
+	return (__int64)(scaled / freq);
+}
+# define TIME_SNAP_FMT "%I64d"
+#else
+# define time_snap clock
+# define TIME_SNAP_FMT "%lu"
+typedef clock_t time_snap_t;
+#endif
+
+
+/**
+ * Benchmarks random access ("select") into one delta-coded block of 128 integers.
+ *
+ * For every bit width b in 0..32 a non-decreasing block is built whose
+ * successive differences fit in b bits, it is packed with
+ * simdpackwithoutmaskd1, and two ways of reading single values are timed over
+ * 1280 accesses each:
+ *   - simdselectd1 directly on the packed data ("fast select"),
+ *   - a full simdunpackd1 followed by an array read ("naive").
+ * The elapsed time_snap() ticks of both loops are printed per bit width.
+ * Results are validated with assert(), so the checks vanish under NDEBUG.
+ */
+void benchmarkSelect() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t initial = 33;
+    uint32_t b;
+    time_snap_t S1, S2, S3;
+    int i;
+    printf("benchmarking select \n");
+
+    /* this test creates delta encoded buffers with different bits, then
+     * reads back every position */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        uint32_t out[128];
+        /* initialize the buffer with deltas that fit in b bits */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)(1655765 * i )) ;
+            /* 1U keeps the shift unsigned: 1<<31 would overflow a signed int */
+            if(b < 32) buffer[i] %= (1U<<b);
+        }
+        /* prefix sum: turn the deltas into the values that get compressed */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+
+        /* the sums may wrap around for large b; force a non-decreasing sequence */
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+
+        S1 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i % 128);
+            assert(valretrieved == buffer[i%128]);
+        }
+        S2 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+            assert(backbuffer[i % 128] == buffer[i % 128]);
+        }
+        S3 = time_snap();
+        /* b is unsigned, so it is printed with %u */
+        printf("bit width = %u, fast select function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT "  \n", b, (S2-S1), (S3-S2));
+    }
+}
+
+/* qsort() comparator for uint32_t elements: yields -1, 0 or 1 when *a is
+ * smaller than, equal to, or larger than *b. */
+int uint32_cmp(const void *a, const void *b)
+{
+    const uint32_t lhs = *(const uint32_t *)a;
+    const uint32_t rhs = *(const uint32_t *)b;
+
+    /* each comparison is 0 or 1, so the difference is exactly -1, 0 or 1 */
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+/**
+ * Binary search for key in the sorted range A[imin] .. A[imax-1]
+ * (adapted from wikipedia).
+ *
+ * Returns the index of an element equal to key when one is found. When key
+ * is absent the upper end of the final search interval is returned, so
+ * callers that need an exact match must compare A[result] with key.
+ * For an empty range (imax <= imin) imax-1 is returned without reading A.
+ */
+int binary_search(uint32_t * A, uint32_t key, int imin, int imax)
+{
+    int imid;
+    imax --;
+    while(imin + 1 < imax) {
+        imid = imin + ((imax - imin) / 2);
+
+        if (A[imid] > key) {
+            imax = imid;
+        } else if (A[imid] < key) {
+            imin = imid;
+        } else {
+            return imid;
+        }
+    }
+    /* the loop only probes interior positions, so a key stored at the lower
+     * end of the range has not been compared yet; check it explicitly */
+    if (imin <= imax && A[imin] == key) {
+        return imin;
+    }
+    return imax;
+}
+
+
+/*
+ * Lower-bound search over the sorted range A[imin] .. A[imax-1]
+ * (adapted from wikipedia).
+ *
+ * Returns the index of the first element that is >= key. When every element
+ * is smaller than key, the last index of the range (imax-1) is returned.
+ */
+int lower_bound(uint32_t * A, uint32_t key, int imin, int imax)
+{
+    int lo = imin;
+    int hi = imax - 1;
+
+    /* narrow [lo, hi] until the two ends are adjacent */
+    while (lo + 1 < hi) {
+        const int mid = lo + ((hi - lo) / 2);
+
+        if (A[mid] < key) {
+            lo = mid;
+        } else {
+            hi = mid;
+        }
+    }
+    /* the lower end is never probed inside the loop, so test it last */
+    return (A[lo] >= key) ? lo : hi;
+}
+
+/**
+ * Benchmarks lower-bound search inside one delta-coded block of 128 integers.
+ *
+ * For every bit width b in 0..32 a sorted block is built, packed with
+ * simdpackwithoutmaskd1 and round-trip checked. Then 1280 lookups are timed
+ * for each of three strategies:
+ *   1. simdsearchd1 on the packed data,
+ *   2. simdunpackd1 followed by lower_bound() on the decoded data ("naive"),
+ *   3. simdsearchwithlengthd1 on the packed data.
+ * Each lookup is cross-checked against the uncompressed block; a mismatch is
+ * reported as "bug A." .. "bug F." (two distinct labels per strategy so that
+ * a failure can be attributed to the strategy that produced it).
+ */
+void benchmarkSearch() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t out[128];
+    uint32_t result, initial = 0;
+    uint32_t b, i;
+    time_snap_t S1, S2, S3, S4;
+
+    printf("benchmarking search \n");
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        /* initialize the buffer with random deltas that fit in b bits */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)rand()) ;
+            /* 1U keeps the shift unsigned: 1<<31 would overflow a signed int */
+            if(b < 32) buffer[i] %= (1U<<b);
+        }
+
+        qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
+
+        /* prefix sum: turn the deltas into the values that get compressed */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+        /* the sums may wrap around for large b; force a non-decreasing sequence */
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+        simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+
+        for (i = 0; i < 128; i++) {
+            assert(buffer[i] == backbuffer[i]);
+         }
+        /* strategy 1: search directly in the packed block */
+        S1 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            __m128i vecinitial = _mm_set1_epi32(initial);
+            pos = simdsearchd1(&vecinitial, (__m128i *)out, b,
+                               pseudorandomkey, &result);
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug A.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug B.\n");
+            }
+        }
+        /* strategy 2: decode the whole block, then search the plain array */
+        S2 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+            pos =  lower_bound(backbuffer, pseudorandomkey, 0, 128);
+            result = backbuffer[pos];
+
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug C.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug D.\n");
+            }
+        }
+        /* strategy 3: packed search with an explicit element count */
+        S3 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128,
+                               pseudorandomkey, &result);
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug E.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug F.\n");
+            }
+        }
+        S4 = time_snap();
+
+        /* b is unsigned, so it is printed with %u */
+        printf("bit width = %u, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT "  \n", b, (S2-S1), (S3-S2), (S4-S3) );
+    }
+}
+
+
+/* Entry point: runs the search benchmark followed by the select benchmark. */
+int main() {
+#ifdef _MSC_VER
+    /* time_snap() divides by freq, so it has to be set before any timing */
+    QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
+#endif
+    benchmarkSearch();
+    benchmarkSelect();
+    return 0;
+}
--- a/cpp/simdcomp/benchmarks/bitpackingbenchmark.c
+++ b/cpp/simdcomp/benchmarks/bitpackingbenchmark.c
@@ -0,0 +1,205 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "simdcomp.h"
+
+
+/*
+ * RDTSC_START(cycles): stores the current time-stamp counter in the 64-bit
+ * lvalue `cycles`. cpuid is executed before rdtsc; presumably it acts as a
+ * serializing barrier so that earlier instructions cannot drift into the
+ * timed region (this looks like the pattern from Intel's benchmarking white
+ * paper -- confirm). edx/eax hold the high/low 32 bits of the counter.
+ * GCC-style inline assembly for x86-64 only.
+ */
+#define RDTSC_START(cycles)                                                   \
+    do {                                                                      \
+        register unsigned cyc_high, cyc_low;                                  \
+        __asm volatile(                                                       \
+            "cpuid\n\t"                                                       \
+            "rdtsc\n\t"                                                       \
+            "mov %%edx, %0\n\t"                                               \
+            "mov %%eax, %1\n\t"                                               \
+            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
+    } while (0)
+
+/*
+ * RDTSC_FINAL(cycles): counterpart of RDTSC_START for the end of a timed
+ * region. It reads the counter with rdtscp, saves edx/eax, and only then
+ * executes cpuid, presumably so that later instructions cannot start before
+ * the counter has been read (confirm against the Intel white paper).
+ */
+#define RDTSC_FINAL(cycles)                                                   \
+    do {                                                                      \
+        register unsigned cyc_high, cyc_low;                                  \
+        __asm volatile(                                                       \
+            "rdtscp\n\t"                                                      \
+            "mov %%edx, %0\n\t"                                               \
+            "mov %%eax, %1\n\t"                                               \
+            "cpuid\n\t"                                                       \
+            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
+    } while (0)
+
+
+
+
+/**
+ * Allocates an array of `length` pseudo-random integers (taken from rand()),
+ * each reduced to its `bit` low-order bits.
+ *
+ * bit must be smaller than 64 because the mask is built with a 64-bit shift.
+ * Returns NULL when the allocation fails; otherwise the caller owns the
+ * array and releases it with free().
+ */
+uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) {
+    uint32_t * answer = malloc(sizeof(uint32_t) * length);
+    /* 64-bit arithmetic so that bit == 32 yields 0xFFFFFFFF without overflow */
+    uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
+    uint32_t i;
+    if (answer == NULL) {
+        return NULL;
+    }
+    for(i = 0; i < length; ++i) {
+        answer[i] = rand() & mask;
+    }
+    return answer;
+}
+
+/**
+ * Allocates an array of `length` integers whose successive differences are
+ * pseudo-random values (from rand()) reduced to their `bit` low-order bits;
+ * the first element is such a value itself. The running sum is unsigned and
+ * may therefore wrap around.
+ *
+ * bit must be smaller than 64 because the mask is built with a 64-bit shift.
+ * Returns NULL when the allocation fails. For length == 0 the result of
+ * malloc(0) is returned untouched. The caller releases the array with free().
+ */
+uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) {
+    uint32_t * answer = malloc(sizeof(uint32_t) * length);
+    /* 64-bit arithmetic so that bit == 32 yields 0xFFFFFFFF without overflow */
+    uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
+    uint32_t i;
+    /* nothing may be written on failure or into a zero-sized allocation */
+    if (answer == NULL || length == 0) {
+        return answer;
+    }
+    answer[0] = rand() & mask;
+    for(i = 1; i < length; ++i) {
+        answer[i] = answer[i-1] + (rand() & mask);
+    }
+    return answer;
+}
+
+
+/**
+ * Times simdpackwithoutmask and simdunpack on one block of 128 random
+ * integers for every bit width from 1 to 32.
+ *
+ * For each width the operation is repeated 500 times and the smallest
+ * RDTSC_START/RDTSC_FINAL difference is kept. The output is one tab-separated
+ * line per width (width, pack cycles per int, unpack cycles per int), preceded
+ * by '#' header lines and followed by two blank lines for gnuplot.
+ * If an allocation fails a message is written to stderr and the run stops.
+ */
+void demo128() {
+    const uint32_t length = 128;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %u integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+
+        uint32_t * data = get_random_array_from_bit_width(length, bit);
+        __m128i * buffer = malloc(length * sizeof(uint32_t));
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        if (data == NULL || buffer == NULL || backdata == NULL) {
+            /* free(NULL) is a no-op, so release whatever was obtained */
+            fprintf(stderr, "%s: out of memory\n", __func__);
+            free(data);
+            free(buffer);
+            free(backdata);
+            return;
+        }
+        printf("%u\t",bit);
+        /* start from the largest value so the first measurement always wins */
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdpackwithoutmask(data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdunpack(buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+
+/**
+ * Times the differential-coding variants simdpackwithoutmaskd1 and
+ * simdunpackd1 (initial offset 0) on one block of 128 integers for every bit
+ * width from 1 to 32.
+ *
+ * For each width the operation is repeated 500 times and the smallest
+ * RDTSC_START/RDTSC_FINAL difference is kept. The output is one tab-separated
+ * line per width (width, pack cycles per int, unpack cycles per int), preceded
+ * by '#' header lines and followed by two blank lines for gnuplot.
+ * If an allocation fails a message is written to stderr and the run stops.
+ */
+void demo128_d1() {
+    const uint32_t length = 128;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %u integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+
+        uint32_t * data = get_random_array_from_bit_width_d1(length, bit);
+        __m128i * buffer = malloc(length * sizeof(uint32_t));
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        if (data == NULL || buffer == NULL || backdata == NULL) {
+            /* free(NULL) is a no-op, so release whatever was obtained */
+            fprintf(stderr, "%s: out of memory\n", __func__);
+            free(data);
+            free(buffer);
+            free(backdata);
+            return;
+        }
+        printf("%u\t",bit);
+        /* start from the largest value so the first measurement always wins */
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdpackwithoutmaskd1(0,data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdunpackd1(0,buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+
+#ifdef __AVX2__
+/**
+ * AVX2 counterpart of demo128: times avxpackwithoutmask and avxunpack on one
+ * block of 256 random integers for every bit width from 1 to 32.
+ *
+ * For each width the operation is repeated 500 times and the smallest
+ * RDTSC_START/RDTSC_FINAL difference is kept. The output is one tab-separated
+ * line per width (width, pack cycles per int, unpack cycles per int), preceded
+ * by '#' header lines and followed by two blank lines for gnuplot.
+ * If an allocation fails a message is written to stderr and the run stops.
+ */
+void demo256() {
+    const uint32_t length = 256;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %u integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+
+        uint32_t * data = get_random_array_from_bit_width(length, bit);
+        __m256i * buffer = malloc(length * sizeof(uint32_t));
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        if (data == NULL || buffer == NULL || backdata == NULL) {
+            /* free(NULL) is a no-op, so release whatever was obtained */
+            fprintf(stderr, "%s: out of memory\n", __func__);
+            free(data);
+            free(buffer);
+            free(backdata);
+            return;
+        }
+        printf("%u\t",bit);
+        /* start from the largest value so the first measurement always wins */
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            avxpackwithoutmask(data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            avxunpack(buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+#endif /* avx 2 */
+
+
+/* Entry point: runs the 128-integer benchmarks (plain and differential) and,
+ * when the file is compiled with AVX2 enabled, the 256-integer one. */
+int main() {
+    demo128();
+    demo128_d1();
+#ifdef __AVX2__
+    demo256();
+#endif
+    return 0;
+
+
+}
--- a/cpp/simdcomp/example.c
+++ b/cpp/simdcomp/example.c
@@ -0,0 +1,195 @@
+/* Type "make example" to build this example program. */
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+#include "simdcomp.h"
+
+/**
+We provide several different code examples.
+**/
+
+
+/**
+ * Very simple test to illustrate a simple application: packs 9999 integers
+ * (0, 1, 2, ...) with a single bit width, unpacks them again and compares.
+ *
+ * Returns 0 when the round trip reproduces the input, and -1 on a mismatch
+ * or when memory cannot be allocated. All buffers are released on every path.
+ */
+int compress_decompress_demo() {
+    size_t k, N = 9999;
+    __m128i * endofbuf;
+    int howmanybytes;
+    float compratio;
+    int status = -1;
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint8_t * buffer = NULL;
+    uint8_t * shrunk;
+    uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
+    uint32_t b;
+    printf("== simple test\n");
+
+    if (datain == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        goto cleanup;
+    }
+
+    for (k = 0; k < N; ++k) {       /* start with k=0, not k=1! */
+        datain[k] = (uint32_t)k;
+    }
+
+    b = maxbits_length(datain, N);
+    buffer = malloc(simdpack_compressedbytes(N,b));
+    if (buffer == NULL) {
+        printf("out of memory\n");
+        goto cleanup;
+    }
+    /* endofbuf points to the end of the compressed data */
+    endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
+    howmanybytes = (int)((endofbuf-(__m128i *)buffer)*sizeof(__m128i)); /* number of compressed bytes */
+    compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes;
+    /* shrinking is optional but safe; keep the old pointer if realloc fails,
+     * because the original allocation stays valid in that case */
+    shrunk = realloc(buffer, (size_t)howmanybytes);
+    if (shrunk != NULL) {
+        buffer = shrunk;
+    }
+    printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio);
+    /* in actual applications b must be stored and retrieved: caller is responsible for that. */
+    simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */
+
+    for (k = 0; k < N; ++k) {
+        if(datain[k] != backbuffer[k]) {
+            printf("bug at %lu \n",(unsigned long)k);
+            goto cleanup;
+        }
+    }
+    printf("Code works!\n");
+    status = 0;
+cleanup:
+    /* free(NULL) is a no-op, so this is safe on every path */
+    free(datain);
+    free(buffer);
+    free(backbuffer);
+    return status;
+}
+
+
+
+/* Differentially packs datain into buffer, one block of SIMDBlockSize
+ * integers at a time, and returns how many bytes were written (used below in
+ * simple_demo). Every block is stored as one byte holding its bit width
+ * followed by that many 128-bit words. A trailing partial block is not
+ * compressed; only a warning is printed for it. */
+size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+    const size_t nblocks = length / SIMDBlockSize;
+    uint8_t * out = buffer;
+    uint32_t * block = datain;
+    uint32_t previous = 0; /* value preceding the current block */
+    size_t remaining;
+    if(length % SIMDBlockSize != 0) {
+        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
+    }
+    for(remaining = nblocks; remaining > 0; --remaining) {
+        const uint32_t width = simdmaxbitsd1(previous, block);
+        *out++ = (uint8_t)width;
+        simdpackwithoutmaskd1(previous, block, (__m128i *) out, width);
+        /* the next block is coded relative to the last value of this one */
+        previous = block[SIMDBlockSize - 1];
+        out += width * sizeof(__m128i);
+        block += SIMDBlockSize;
+    }
+    return out - buffer;
+}
+
+/**
+ * Another illustration: compresses 1000 blocks of sorted integers with
+ * compress() for several average gaps (1, 3, 9, ..., 243) and prints the
+ * compression ratio, the block-by-block decoding speed and the speed of a
+ * memcpy baseline.
+ *
+ * If memory cannot be allocated a message is printed and nothing is measured.
+ */
+void simple_demo() {
+    size_t REPEAT = 10, gap;
+    size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    size_t compsize;
+    clock_t start, end;
+    uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    printf("== simple demo\n");
+    if (datain == NULL || buffer == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        /* free(NULL) is a no-op, so release whatever was obtained */
+        free(buffer);
+        free(datain);
+        free(backbuffer);
+        return;
+    }
+    for (gap = 1; gap <= 243; gap *= 3) {
+        size_t k, repeat;
+        uint32_t offset;
+        uint32_t bogus = 0; /* accumulates decoded values so the work cannot be optimised away */
+        double numberofseconds;
+
+        printf("\n");
+        printf(" gap = %lu \n", (unsigned long) gap);
+        datain[0] = 0;
+        for (k = 1; k < N; ++k)
+            datain[k] = datain[k-1] + ( rand() % (gap + 1) );
+        compsize = compress(datain,N,buffer);
+        printf("compression ratio = %f \n",  (N * sizeof(uint32_t))/ (compsize * 1.0 ));
+        start = clock();
+        for(repeat = 0; repeat < REPEAT; ++repeat) {
+            uint8_t * decbuffer = buffer;
+            /* every pass restarts from the initial value that compress() used;
+             * carrying the offset over from the previous pass would decode
+             * different values */
+            offset = 0;
+            for (k = 0; k * SIMDBlockSize < N; ++k) {
+                uint8_t b = *decbuffer++;
+                simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
+                /* do something here with backbuffer */
+                bogus += backbuffer[3];
+                decbuffer += b * sizeof(__m128i);
+                offset = backbuffer[SIMDBlockSize - 1];
+            }
+        }
+        end = clock();
+        numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+        printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+        start = clock();
+        for(repeat = 0; repeat < REPEAT; ++repeat) {
+            uint8_t * decbuffer = buffer;
+            for (k = 0; k * SIMDBlockSize < N; ++k) {
+                /* NOTE(review): decbuffer is a byte pointer, so the source
+                 * advances by k*SIMDBlockSize bytes (not integers) through the
+                 * compressed buffer -- confirm this is the intended baseline */
+                memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t));
+                bogus += backbuffer[3] - backbuffer[100];
+            }
+        }
+        end = clock();
+        numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+        printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+        /* bogus is unsigned, so it is printed with %u */
+        printf("ignore me %u \n",bogus);
+        printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
+    }
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+}
+
+/* Packs datain into buffer block by block, without differential coding and
+ * with a separate bit width per block of SIMDBlockSize integers; returns the
+ * number of bytes written (used below in varying_bit_width_demo). Every block
+ * is stored as one byte holding its bit width followed by that many 128-bit
+ * words. A trailing partial block is not compressed; only a warning is
+ * printed for it. */
+size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+    uint8_t * out = buffer;
+    uint32_t * const stop = datain + (length / SIMDBlockSize) * SIMDBlockSize;
+    if(length % SIMDBlockSize != 0) {
+        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
+    }
+    while(datain != stop) {
+        const uint32_t width = maxbits(datain);
+        *out++ = (uint8_t)width;
+        simdpackwithoutmask(datain, (__m128i *)out, width);
+        out += width * sizeof(__m128i);
+        datain += SIMDBlockSize;
+    }
+    return out - buffer;
+}
+
+/**
+ * Here we compress the data in blocks of 128 integers with varying bit width:
+ * two blocks of random values are packed with varying_bit_width_compress(),
+ * unpacked block by block and compared with the input.
+ *
+ * Returns 0 when the round trip reproduces the input, and -1 on a mismatch
+ * or when memory cannot be allocated. All buffers are released on every path.
+ */
+int varying_bit_width_demo() {
+    size_t nn = 128 * 2;
+    uint32_t * datainn = malloc(nn * sizeof(uint32_t));
+    /* worst case: 32 bits per value plus one width byte per block */
+    uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
+    uint8_t * initbuffern = buffern; /* buffern is advanced while decoding */
+    uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
+    size_t k, compsize;
+    int status = -1;
+    printf("== varying bit-width demo\n");
+
+    if (datainn == NULL || initbuffern == NULL || backbuffern == NULL) {
+        printf("out of memory\n");
+        goto cleanup;
+    }
+
+    for(k=0; k<nn; ++k) {
+        datainn[k] = rand() % (k + 1);
+    }
+
+    compsize = varying_bit_width_compress(datainn,nn,buffern);
+    printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
+           (unsigned)(nn * sizeof(uint32_t)));
+
+    for (k = 0; k * SIMDBlockSize < nn; ++k) {
+        /* each block starts with one byte holding its bit width */
+        uint32_t b = *buffern;
+        buffern++;
+        simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
+        buffern += b * sizeof(__m128i);
+    }
+
+    for (k = 0; k < nn; ++k) {
+        if(backbuffern[k] != datainn[k]) {
+            printf("bug\n");
+            goto cleanup;
+        }
+    }
+    printf("Code works!\n");
+    status = 0;
+cleanup:
+    /* free(NULL) is a no-op, so this is safe on every path */
+    free(datainn);
+    free(initbuffern);
+    free(backbuffern);
+    return status;
+}
+
+/* Runs the two self-checking demos and, only if both succeed, the timing
+ * demo. Returns -1 as soon as a self-check fails and 0 otherwise. */
+int main() {
+    int status = compress_decompress_demo();
+    if(status == 0) {
+        status = varying_bit_width_demo();
+    }
+    if(status != 0) {
+        return -1;
+    }
+    simple_demo();
+    return 0;
+}
--- a/cpp/simdcomp/go/README.md
+++ b/cpp/simdcomp/go/README.md
@@ -0,0 +1,13 @@
+Simple Go demo
+==============
+
+Setup
+======
+
+Start by installing the simdcomp library (make && make install).
+
+Then type:
+
+go run test.go
+
+
--- a/cpp/simdcomp/go/test.go
+++ b/cpp/simdcomp/go/test.go
@@ -0,0 +1,71 @@
+/////////
+// This particular file is in the public domain.
+// Author: Daniel Lemire
+////////
+
+package main 
+
+/*
+#cgo LDFLAGS: -lsimdcomp
+#include <simdcomp.h>
+*/
+import "C"
+import "fmt"
+
+//////////
+// For this demo, we pack and unpack blocks of 128 integers, first without
+// and then with differential coding. After each round trip the recovered
+// data is compared with the original; "Bug " is printed on a mismatch.
+/////////
+func main() {
+        // I am going to use C types. Alternative might be to use unsafe.Pointer calls, see http://bit.ly/1ndw3W3
+        // this is our original data
+        var data [128]C.uint32_t
+        for i := C.uint32_t(0); i < C.uint32_t(128); i++ {
+            data[i] = i
+        }
+
+        ////////////
+        // We first pack without differential coding
+        ///////////
+        // computing how many bits per int. is needed
+        b := C.maxbits(&data[0])
+        ratio := 32.0 / float64(b)
+        fmt.Println("Bit width  ", b)
+        fmt.Printf("Compression ratio %f \n", ratio)
+        // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
+        out := make([]C.__m128i, b)
+        C.simdpackwithoutmask(&data[0], &out[0], b)
+        var recovereddata [128]C.uint32_t
+        C.simdunpack(&out[0], &recovereddata[0], b)
+        for i := 0; i < 128; i++ {
+            if data[i] != recovereddata[i] {
+                  fmt.Println("Bug ")
+                  return
+            }
+        }
+
+        ///////////
+        // Next, we use differential coding
+        //////////
+        offset := C.uint32_t(0) // if you pack data from K to K + 128, offset should be the value at K-1. When K = 0, choose a default
+        b1 := C.simdmaxbitsd1(offset, &data[0])
+        ratio1 := 32.0 / float64(b1)
+        fmt.Println("Bit width  ", b1)
+        fmt.Printf("Compression ratio %f \n", ratio1)
+        // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
+        out = make([]C.__m128i, b1)
+        C.simdpackwithoutmaskd1(offset, &data[0], &out[0], b1)
+        C.simdunpackd1(offset, &out[0], &recovereddata[0], b1)
+        for i := 0; i < 128; i++ {
+            if data[i] != recovereddata[i] {
+                  fmt.Println("Bug ")
+                  return
+            }
+        }
+
+        fmt.Println("test successful.")
+
+}
--- a/cpp/simdcomp/include/avxbitpacking.h
+++ b/cpp/simdcomp/include/avxbitpacking.h
@@ -0,0 +1,40 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef INCLUDE_AVXBITPACKING_H_
+#define INCLUDE_AVXBITPACKING_H_
+
+
+#ifdef __AVX2__
+
+#include "portability.h"
+
+
+/* AVX2 is required */
+#include <immintrin.h>
+/* for memset */
+#include <string.h>
+
+#include "simdcomputil.h"
+
+/* give the declarations C linkage when this header is included directly from
+   C++, as simdfor.h and simdintegratedbitpacking.h already do */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* number of 32-bit integers handled per AVX block */
+enum{ AVXBlockSize = 256};
+
+/* max integer logarithm over a range of AVXBlockSize integers (256 integers) */
+uint32_t avxmaxbits(const uint32_t * begin);
+
+/* reads 256 values from "in", writes  "bit" 256-bit vectors to "out" */
+void avxpack(const uint32_t *  in,__m256i *  out, const uint32_t bit);
+
+/* reads 256 values from "in", writes  "bit" 256-bit vectors to "out".
+ * NOTE(review): by analogy with simdpackwithoutmask, the inputs are presumably
+ * assumed to already fit in "bit" bits -- confirm against src/avxbitpacking.c */
+void avxpackwithoutmask(const uint32_t *  in,__m256i *  out, const uint32_t bit);
+
+/* reads  "bit" 256-bit vectors from "in", writes  256 values to "out" */
+void avxunpack(const __m256i *  in,uint32_t *  out, const uint32_t bit);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* __AVX2__ */
+
+#endif /* INCLUDE_AVXBITPACKING_H_ */
--- a/cpp/simdcomp/include/portability.h
+++ b/cpp/simdcomp/include/portability.h
@@ -0,0 +1,81 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITCOMPAT_H_
+#define SIMDBITCOMPAT_H_
+
+#include <iso646.h> /* mostly for Microsoft compilers */
+#include <string.h>
+
+#if SIMDCOMP_DEBUG
+# define SIMDCOMP_ALWAYS_INLINE inline
+# define SIMDCOMP_NEVER_INLINE
+# define SIMDCOMP_PURE
+#else
+# if defined(__GNUC__)
+#  if __GNUC__ >= 3
+#   define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
+#   define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
+#   define SIMDCOMP_PURE __attribute__((pure))
+#  else
+#   define SIMDCOMP_ALWAYS_INLINE inline
+#   define SIMDCOMP_NEVER_INLINE
+#   define SIMDCOMP_PURE
+#  endif
+# elif defined(_MSC_VER)
+#  define SIMDCOMP_ALWAYS_INLINE __forceinline
+#  define SIMDCOMP_NEVER_INLINE
+#  define SIMDCOMP_PURE
+# else
+#  if __has_attribute(always_inline)
+#   define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
+#  else
+#   define SIMDCOMP_ALWAYS_INLINE inline
+#  endif
+#  if __has_attribute(noinline)
+#   define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
+#  else
+#   define SIMDCOMP_NEVER_INLINE
+#  endif
+#  if __has_attribute(pure)
+#   define SIMDCOMP_PURE __attribute__((pure))
+#  else
+#   define SIMDCOMP_PURE
+#  endif
+# endif
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER < 1600
+typedef unsigned int uint32_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+#else
+#include <stdint.h> /* part of Visual Studio 2010 and better, others likely anyway */
+#endif
+
+#if defined(_MSC_VER)
+#define SIMDCOMP_ALIGNED(x) __declspec(align(x))
+#else
+#if defined(__GNUC__)
+#define SIMDCOMP_ALIGNED(x) __attribute__ ((aligned(x)))
+#endif
+#endif
+
+/* SIMDCOMP_CTZ(result, mask): statement macro that stores in "result" the
+ * number of trailing zero bits of the 32-bit value "mask", or 32 when "mask"
+ * is zero. Both variants evaluate "mask" exactly once and behave the same for
+ * zero (__builtin_ctz alone is undefined for a zero argument). */
+#if defined(_MSC_VER)
+# include <intrin.h>
+/* 64-bit needs extending */
+# define SIMDCOMP_CTZ(result, mask) do { \
+		unsigned long simdcomp_ctz_index_; \
+		if (!_BitScanForward(&(simdcomp_ctz_index_), (mask))) { \
+			(result) = 32U; \
+		} else { \
+			(result) = (uint32_t)(simdcomp_ctz_index_); \
+		} \
+	} while (0)
+#else
+# define SIMDCOMP_CTZ(result, mask) do { \
+		const unsigned int simdcomp_ctz_mask_ = (unsigned int)(mask); \
+		if (simdcomp_ctz_mask_ == 0U) { \
+			(result) = 32U; \
+		} else { \
+			(result) = (uint32_t)__builtin_ctz(simdcomp_ctz_mask_); \
+		} \
+	} while (0)
+#endif
+
+#endif /* SIMDBITCOMPAT_H_ */
+
--- a/cpp/simdcomp/include/simdbitpacking.h
+++ b/cpp/simdcomp/include/simdbitpacking.h
@@ -0,0 +1,72 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITPACKING_H_
+#define SIMDBITPACKING_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+/* for memset */
+#include <string.h>
+
+#include "simdcomputil.h"
+
+/* give the declarations C linkage when this header is included directly from
+   C++, as simdfor.h and simdintegratedbitpacking.h already do */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***
+* Please see example.c for various examples on how to make good use
+* of these functions.
+*/
+
+
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
+ * The input values are masked so that only the least significant "bit" bits are used. */
+void simdpack(const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
+ * The input values are assumed to be less than 1<<bit. */
+void simdpackwithoutmask(const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+/* reads  "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpack(const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+
+/* how many compressed bytes are needed to compress "length" integers using a bit width of "bit" with
+the simdpack_length function. */
+int simdpack_compressedbytes(int length, const uint32_t bit);
+
+/* like simdpack, but supports an undetermined number of inputs.
+ * This is useful if you need to pack an array of integers whose length is not a multiple of 128.
+ * Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location between
+ the provided (out) pointer and the returned pointer. */
+__m128i * simdpack_length(const uint32_t *   in, size_t length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpack, but supports an undetermined number of inputs.
+ * This is useful if you need to unpack an array of integers whose length is not a multiple of 128.
+ * Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided
+ (in) pointer and the returned pointer. */
+const __m128i * simdunpack_length(const __m128i *   in, size_t length, uint32_t * out, const uint32_t bit);
+
+
+
+
+/* like simdpack, but supports an undetermined small number of inputs. This is useful if you need to pack fewer
+than 128 integers.
+ * Note that this function is much slower.
+ * Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location
+ between the provided (out) pointer and the returned pointer. */
+__m128i * simdpack_shortlength(const uint32_t *   in, int length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpack, but supports an undetermined small number of inputs. This is useful if you need to unpack fewer
+ than 128 integers.
+ * Note that this function is much slower.
+ * Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided (in)
+ pointer and the returned pointer. */
+const __m128i * simdunpack_shortlength(const __m128i *   in, int length, uint32_t * out, const uint32_t bit);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value"
+ * (NOTE(review): "b" is presumably the bit width the block was packed with -- confirm) */
+void simdfastset(__m128i * in128, uint32_t b, uint32_t value, size_t index);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SIMDBITPACKING_H_ */
--- a/cpp/simdcomp/include/simdcomp.h
+++ b/cpp/simdcomp/include/simdcomp.h
@@ -0,0 +1,22 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMP_H_
+#define SIMDCOMP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "simdbitpacking.h"
+#include "simdcomputil.h"
+#include "simdfor.h"
+#include "simdintegratedbitpacking.h"
+#include "avxbitpacking.h"
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif 
--- a/cpp/simdcomp/include/simdcomputil.h
+++ b/cpp/simdcomp/include/simdcomputil.h
@@ -0,0 +1,54 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMPUTIL_H_
+#define SIMDCOMPUTIL_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+/* give the declarations C linkage when this header is included directly from
+   C++, as simdfor.h and simdintegratedbitpacking.h already do */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* returns the integer logarithm of v (bit width) */
+uint32_t bits(const uint32_t v);
+
+/* max integer logarithm over a range of SIMDBlockSize integers (128 integers) */
+uint32_t maxbits(const uint32_t * begin);
+
+/* same as maxbits, but we specify the number of integers */
+uint32_t maxbits_length(const uint32_t * in,uint32_t length);
+
+/* number of 32-bit integers handled per SSE block */
+enum{ SIMDBlockSize = 128};
+
+
+/* computes (quickly) the minimal value of 128 values */
+uint32_t simdmin(const uint32_t * in);
+
+/* computes (quickly) the minimal value of the specified number of values */
+uint32_t simdmin_length(const uint32_t * in, uint32_t length);
+
+/* the two functions below are only declared when SSE4.1 is enabled */
+#ifdef __SSE4_1__
+/* computes (quickly) the minimal and maximal value of the specified number of values */
+void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax);
+
+/* computes (quickly) the minimal and maximal value of the 128 values */
+void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax);
+
+#endif
+
+/* like maxbits over 128 integers (SIMDBlockSize) with provided initial value
+   and using differential coding */
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in);
+
+/* like simdmaxbitsd1, but calculates maxbits over |length| integers
+   with provided initial value. |length| can be any arbitrary value. */
+uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
+                uint32_t length);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SIMDCOMPUTIL_H_ */
--- a/cpp/simdcomp/include/simdfor.h
+++ b/cpp/simdcomp/include/simdfor.h
@@ -0,0 +1,72 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef INCLUDE_SIMDFOR_H_
+#define INCLUDE_SIMDFOR_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+#include "simdcomputil.h"
+#include "simdbitpacking.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out" */
+void simdpackFOR(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpackFOR(uint32_t initvalue, const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+/* how many compressed bytes are needed to compress "length" integers using a bit width of "bit" with
+the simdpackFOR_length function. */
+int simdpackFOR_compressedbytes(int length, const uint32_t bit);
+
+/* like simdpackFOR, but supports an undetermined number of inputs. 
+This is useful if you need to pack less than 128 integers. Note that this function is much slower. 
+ Compressed data is stored in the memory location between 
+ the provided (out) pointer and the returned pointer. */
+__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t *   in, int length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpackFOR, but supports an undetermined number of inputs. 
+This is useful if you need to unpack less than 128 integers. Note that this function is much slower. 
+ The read compressed data is between the provided 
+ (in) pointer and the returned pointer.  */
+const __m128i * simdunpackFOR_length(uint32_t initvalue, const __m128i *   in, int length, uint32_t * out, const uint32_t bit);
+
+
+/* returns the value stored at the specified "slot".
+* */
+uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int slot);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
+void simdfastsetFOR(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
+
+
+/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The encoded key is stored in "*presult".
+ * Only the first "length" decoded integers are considered; the others are ignored. If no value is
+ * larger than or equal to the key, length is returned. Length should be no larger than 128.
+ *
+ * If no value is larger or equal to the key,
+* length is returned */
+int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int length, uint32_t key, uint32_t *presult);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+
+
+
+#endif /* INCLUDE_SIMDFOR_H_ */
--- a/cpp/simdcomp/include/simdintegratedbitpacking.h
+++ b/cpp/simdcomp/include/simdintegratedbitpacking.h
@@ -0,0 +1,98 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMD_INTEGRATED_BITPACKING_H
+#define SIMD_INTEGRATED_BITPACKING_H
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+#include "simdcomputil.h"
+#include "simdbitpacking.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
+   integer values should be in sorted order (for best results).
+   The differences are masked so that only the least significant "bit" bits are used. */
+void simdpackd1(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
+   integer values should be in sorted order (for best results).
+   The difference values are assumed to be less than 1<<bit. */
+void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpackd1(uint32_t initvalue, const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+/* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The encoded key is stored in "*presult". If no value is larger than or equal to the key,
+* 128 is returned. The pointer initOffset is a pointer to the last four values decoded
+* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init)),
+* and the vector gets updated.
+**/
+int
+simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit,
+                uint32_t key, uint32_t *presult);
+
+
+/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The encoded key is stored in "*presult".
+ * Only the first "length" decoded integers are considered; the others are ignored. If no value is
+ * larger than or equal to the key, length is returned. Length should be no larger than 128.
+ *
+ * If no value is larger or equal to the key,
+* length is returned */
+int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int length, uint32_t key, uint32_t *presult);
+
+
+
+/* returns the value stored at the specified "slot".
+* */
+uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int slot);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value",
+ * you must somehow know the previous value.
+ * Because of differential coding, all following values are incremented by the offset between this new
+ * value and the old value... 
+ * This function is useful if you want to modify the last value.
+ */
+void simdfastsetd1fromprevious( __m128i * in, uint32_t bit, uint32_t previousvalue, uint32_t value, size_t index);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value",
+ * This function computes the previous value if needed.
+ * Because of differential coding, all following values are incremented by the offset between this new
+ * value and the old value...
+ * This function is useful if you want to modify the last value.
+ */
+void simdfastsetd1(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
+
+
+/*Simply scan the data
+* The pointer initOffset is a pointer to the last four values decoded
+* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init)),
+* and the vector gets updated.
+* */
+
+void
+simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
--- a/cpp/simdcomp/makefile
+++ b/cpp/simdcomp/makefile
@@ -0,0 +1,79 @@
+# minimalist makefile
+.SUFFIXES:
+#
+.SUFFIXES: .cpp .o .c .h
+ifeq ($(DEBUG),1)
+CFLAGS = -fPIC  -std=c89 -ggdb -msse4.1 -march=native -Wall -Wextra -Wshadow -fsanitize=undefined  -fno-omit-frame-pointer -fsanitize=address
+else
+CFLAGS = -fPIC -std=c89 -O3 -msse4.1  -march=native -Wall -Wextra -Wshadow
+endif # debug
+LDFLAGS = -shared
+LIBNAME=libsimdcomp.so.0.0.3
+all:  unit unit_chars bitpackingbenchmark $(LIBNAME)
+# these targets do not produce a file of the same name
+.PHONY: all test install uninstall clean
+# build the test binaries before running them
+test: unit unit_chars
+	./unit
+	./unit_chars
+# Depend on the library itself: OBJECTS is only assigned further down and make
+# expands prerequisites when the rule is read, so listing it here had no effect.
+install: $(LIBNAME)
+	cp $(LIBNAME) /usr/local/lib
+	ln -sf /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
+	ldconfig
+	cp $(HEADERS) /usr/local/include
+
+
+
+HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h
+
+uninstall:
+	for h in $(HEADERS) ; do rm  /usr/local/$$h; done
+	rm  /usr/local/lib/$(LIBNAME)
+	rm /usr/local/lib/libsimdcomp.so
+	ldconfig
+
+
+OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \
+		 simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o
+
+$(LIBNAME): $(OBJECTS)
+	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS)  $(LDFLAGS)
+
+
+avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude
+
+
+simdfor.o: ./src/simdfor.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude
+
+
+simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
+
+simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
+
+simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c  $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
+
+simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude
+
+simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude
+
+example: ./example.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o example ./example.c -Iinclude  $(OBJECTS)
+
+unit: ./tests/unit.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude  $(OBJECTS)
+
+bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude  $(OBJECTS)
+benchmark: ./benchmarks/benchmark.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude  $(OBJECTS)
+dynunit: ./tests/unit.c    $(HEADERS) $(LIBNAME)
+	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude  -lsimdcomp
+
+unit_chars: ./tests/unit_chars.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude  $(OBJECTS)
+clean:
+	rm -f unit *.o $(LIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars
--- a/cpp/simdcomp/makefile.vc
+++ b/cpp/simdcomp/makefile.vc
@@ -0,0 +1,104 @@
+
+!IFNDEF MACHINE
+!IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64"
+MACHINE=x64
+!ELSE
+MACHINE=x86
+!ENDIF
+!ENDIF
+
+!IFNDEF DEBUG
+DEBUG=no
+!ENDIF
+
+!IFNDEF CC
+CC=cl.exe
+!ENDIF
+
+!IFNDEF AR
+AR=lib.exe
+!ENDIF
+
+!IFNDEF LINK
+LINK=link.exe
+!ENDIF
+
+!IFNDEF PGO
+PGO=no
+!ENDIF
+
+!IFNDEF PGI
+PGI=no
+!ENDIF
+
+INC = /Iinclude
+
+!IF "$(DEBUG)"=="yes"
+CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm
+ARFLAGS = /nologo
+LDFLAGS = /nologo /debug /nodefaultlib:msvcrt
+!ELSE
+CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP
+ARFLAGS = /nologo /LTCG
+LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf
+!ENDIF
+
+!IF "$(PGI)"=="yes"
+LDFLAGS = $(LDFLAGS) /ltcg:pgi
+!ENDIF
+
+!IF "$(PGO)"=="yes"
+LDFLAGS = $(LDFLAGS) /ltcg:pgo
+!ENDIF
+
+LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \
+	simdpackedsearch.obj simdpackedselect.obj simdfor.obj
+
+
+all: lib dll dynunit unit_chars example benchmark
+# need some good use case scenario to train the instrumented build
+	@if "$(PGI)"=="yes" echo Running PGO training
+	@if "$(PGI)"=="yes" benchmark.exe >nul 2>&1
+	@if "$(PGI)"=="yes" example.exe >nul 2>&1
+
+
+$(LIB_OBJS):
+	$(CC) $(INC) $(CFLAGS) /c src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \
+		src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c
+
+lib: $(LIB_OBJS)
+	$(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS)
+
+dll: $(LIB_OBJS)
+	$(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS)
+
+# Test and benchmark sources live in tests/ and benchmarks/ (see the Unix makefile).
+# "unit" links the static library; every other program links the DLL import
+# library (simdcomp.lib) and therefore depends on the "dll" target that creates it.
+unit: lib
+	$(CC) $(INC) $(CFLAGS) /c tests/unit.c
+	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib
+
+dynunit: dll
+	$(CC) $(INC) $(CFLAGS) /c tests/unit.c
+	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib
+
+unit_chars: dll
+	$(CC) $(INC) $(CFLAGS) /c tests/unit_chars.c
+	$(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp.lib
+
+
+example: dll
+	$(CC) $(INC) $(CFLAGS) /c example.c
+	$(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp.lib
+
+benchmark: dll
+	$(CC) $(INC) $(CFLAGS) /c benchmarks/benchmark.c
+	$(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp.lib
+
+clean:
+	del /Q *.obj
+	del /Q *.lib
+	del /Q *.exe
+	del /Q *.dll
+	del /Q *.pgc
+	del /Q *.pgd
+	del /Q *.pdb
+
--- a/cpp/simdcomp/package.json
+++ b/cpp/simdcomp/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "simdcomp",
+  "version": "0.0.3",
+  "repo": "lemire/simdcomp",
+  "description": "A simple C library for compressing lists of integers",
+  "license": "BSD-3-Clause",
+  "src": [
+    "src/simdbitpacking.c",
+    "src/simdcomputil.c",
+    "src/simdintegratedbitpacking.c",
+    "include/simdbitpacking.h",
+    "include/simdcomp.h",
+    "include/simdcomputil.h",
+    "include/simdintegratedbitpacking.h"
+  ]
+}
--- a/cpp/simdcomp/scripts/avxpacking.py
+++ b/cpp/simdcomp/scripts/avxpacking.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+import sys
+def howmany(bit):
+    """ how many values are we going to pack? """
+    return 256
+
+def howmanywords(bit):
+    return (howmany(bit) * bit + 255)/256
+
+def howmanybytes(bit):
+    return howmanywords(bit) * 16
+
+print("""
+/** code generated by avxpacking.py starts here **/
+""")
+
+print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""")
+print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""")
+
+
+
+
+
+
+def plurial(number):
+    if(number <> 1):
+        return "s"
+    else :
+        return ""
+
+print("")
+print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {");
+print("  (void)compressed;");
+print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
+print("}");
+print("")
+
+for bit in range(1,33):
+    print("")
+    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
+    print("  const __m256i * in = (const __m256i *)  pin;");
+    print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    if( (bit & (bit-1)) <> 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
+    oldword = 0
+    for j in range(howmany(bit)/8):
+      firstword = j * bit / 32
+      if(firstword > oldword):
+        print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
+        oldword = firstword
+      secondword = (j * bit + bit - 1)/32
+      firstshift = (j*bit) % 32
+      if( firstword == secondword):
+          if(firstshift == 0):
+            print("  w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j))
+          else:
+            print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
+      else:
+          print("  tmp = _mm256_lddqu_si256 (in + {0});".format(j))
+          print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
+          secondshift = 32-firstshift
+          print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
+    print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
+    print("}");
+    print("")
+
+
+print("")
+print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {");
+print("  (void)compressed;");
+print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
+print("}");
+print("")
+
+for bit in range(1,33):
+    print("")
+    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
+    print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    print("  const __m256i * in = (const __m256i *) pin;");
+    if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
+    def maskfnc(x):
+        if(bit == 32): return x
+        return " _mm256_and_si256 ( mask, {0}) ".format(x)
+    if( (bit & (bit-1)) <> 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
+    oldword = 0
+    for j in range(howmany(bit)/8):
+      firstword = j * bit / 32
+      if(firstword > oldword):
+        print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
+        oldword = firstword
+      secondword = (j * bit + bit - 1)/32
+      firstshift = (j*bit) % 32
+      loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
+      if( firstword == secondword):
+          if(firstshift == 0):
+            print("  w{0} = {1};".format(firstword%2,loadstr))
+          else:
+            print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
+      else:
+          print("  tmp = {0};".format(loadstr))
+          print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
+          secondshift = 32-firstshift
+          print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
+    print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
+    print("}");
+    print("")
+
+
+print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {");
+print("  (void) compressed;");
+print("  memset(pout,0,{0});".format(howmany(0)));
+print("}");
+print("")
+
+for bit in range(1,33):
+    print("")
+    print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit));
+    print("  /* we are going to access  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    print("  __m256i * out = (__m256i *) pout;");
+    if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
+    maskstr = " _mm256_and_si256 ( mask, {0}) "
+    if (bit == 32) : maskstr = " {0} " # no need
+    oldword = 0
+    print("  w0 = _mm256_lddqu_si256 (compressed);")
+    for j in range(howmany(bit)/8):
+      firstword = j * bit / 32
+      secondword = (j * bit + bit - 1)/32
+      if(secondword > oldword):
+        print("  w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
+        oldword = secondword
+      firstshift = (j*bit) % 32
+      firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
+      if(firstshift == 0):
+          firstshiftstr =" w{0} " # no need
+      wfirst = firstshiftstr.format(firstword%2)
+      if( firstword == secondword):
+          if(firstshift + bit <> 32):
+            wfirst  = maskstr.format(wfirst)
+          print("  _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
+      else:
+          secondshift = (32-firstshift)
+          wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
+          wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
+          wfirstorsecond = maskstr.format(wfirstorsecond)
+          print("  _mm256_storeu_si256(out + {0},\n    {1});".format(j,wfirstorsecond))
+    print("}");
+    print("")
+
+
+print("static avxpackblockfnc avxfuncPackArr[] = {")
+for bit in range(0,32):
+  print("&avxpackblock{0},".format(bit))
+print("&avxpackblock32")
+print("};")
+
+print("static avxpackblockfnc avxfuncPackMaskArr[] = {")
+for bit in range(0,32):
+  print("&avxpackblockmask{0},".format(bit))
+print("&avxpackblockmask32")
+print("};")
+
+
+print("static avxunpackblockfnc avxfuncUnpackArr[] = {")
+for bit in range(0,32):
+  print("&avxunpackblock{0},".format(bit))
+print("&avxunpackblock32")
+print("};")
+print("/** code generated by avxpacking.py ends here **/")
--- a/cpp/simdcomp/scripts/simdfor.py
+++ b/cpp/simdcomp/scripts/simdfor.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+
+from math import ceil
+
+print("""
+/**
+* Blablabla
+*
+*/
+
+""");
+
+def mask(bit):
+  return str((1 << bit) - 1)
+
+for length in [32]:
+  print("""
+static __m128i  iunpackFOR0(__m128i initOffset, const __m128i *   _in , uint32_t *    _out) {
+    __m128i       *out = (__m128i*)(_out);
+    int i;
+    (void) _in;
+    for (i = 0; i < 8; ++i) {
+        _mm_store_si128(out++, initOffset);
+    	_mm_store_si128(out++, initOffset);
+        _mm_store_si128(out++, initOffset);
+        _mm_store_si128(out++, initOffset);
+    }
+
+    return initOffset;
+}
+
+  """)
+  print("""
+
+static void ipackFOR0(__m128i initOffset , const uint32_t *   _in , __m128i *  out  ) {
+    (void) initOffset;
+    (void) _in;
+    (void) out;
+}
+""") 
+  for bit in range(1,33):
+    offsetVar = " initOffset";
+    print("""  
+static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t *   _in, __m128i *   out) {
+    const __m128i       *in = (const __m128i*)(_in);
+    __m128i    OutReg;
+
+      """);
+    
+    if (bit != 32):
+      print("    __m128i CurrIn = _mm_load_si128(in);");
+      print("    __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);");
+    else:
+      print("    __m128i InReg = _mm_load_si128(in);");
+      print("    (void) initOffset;");
+
+
+    inwordpointer = 0
+    valuecounter = 0
+    for k in range(ceil((length * bit) / 32)):
+      if(valuecounter == length): break
+      for x in range(inwordpointer,32,bit):
+        if(x!=0) :
+          print("    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));");
+        else:
+          print("    OutReg = InReg; ");
+        if((x+bit>=32) ):
+          while(inwordpointer<32):
+            inwordpointer += bit
+          print("    _mm_store_si128(out, OutReg);");
+          print("");
+
+          if(valuecounter + 1 < length):
+            print("    ++out;")
+          inwordpointer -= 32;
+          if(inwordpointer>0):
+            print("    OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");");
+        if(valuecounter + 1 < length):
+          print("    ++in;") 
+
+          if (bit != 32):
+            print("    CurrIn = _mm_load_si128(in);");
+            print("    InReg = _mm_sub_epi32(CurrIn, initOffset);");
+          else:
+            print("    InReg = _mm_load_si128(in);");
+          print("");
+        valuecounter = valuecounter + 1
+        if(valuecounter == length): break
+    assert(valuecounter == length)
+    print("\n}\n\n""")
+
+  for bit in range(1,32):
+    offsetVar = " initOffset";
+    print("""\n
+static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const  __m128i*   in, uint32_t *   _out) {
+      """);
+    print("""    __m128i*   out = (__m128i*)(_out);
+    __m128i    InReg = _mm_load_si128(in);
+    __m128i    OutReg;    
+    __m128i     tmp;
+    const __m128i mask =  _mm_set1_epi32((1U<<"""+str(bit)+""")-1);
+
+    """);
+
+    MainText = "";
+
+    MainText += "\n";
+    inwordpointer = 0
+    valuecounter = 0
+    for k in range(ceil((length * bit) / 32)):
+      for x in range(inwordpointer,32,bit):
+        if(valuecounter == length): break
+        if (x > 0):
+          MainText += "    tmp = _mm_srli_epi32(InReg," + str(x) +");\n"; 
+        else:
+          MainText += "    tmp = InReg;\n"; 
+        if(x+bit<32):
+          MainText += "    OutReg = _mm_and_si128(tmp, mask);\n";
+        else:
+          MainText += "    OutReg = tmp;\n";        
+        if((x+bit>=32) ):      
+          while(inwordpointer<32):
+            inwordpointer += bit
+          if(valuecounter + 1 < length):
+             MainText += "    ++in;"
+             MainText += "    InReg = _mm_load_si128(in);\n";
+          inwordpointer -= 32;
+          if(inwordpointer>0):
+            MainText += "    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n";
+        if (bit != 32):
+          MainText += "    OutReg = _mm_add_epi32(OutReg, initOffset);\n"; 
+        MainText += "    _mm_store_si128(out++, OutReg);\n\n"; 
+        MainText += "";
+        valuecounter = valuecounter + 1
+        if(valuecounter == length): break
+    assert(valuecounter == length)
+    print(MainText)
+    print("    return initOffset;");
+    print("\n}\n\n")
+  print("""
+static __m128i iunpackFOR32(__m128i initvalue , const  __m128i*   in, uint32_t *    _out) {
+	__m128i * mout = (__m128i *)_out;
+	__m128i invec;
+	size_t k;
+	for(k = 0; k < 128/4; ++k) {
+		invec =  _mm_load_si128(in++);
+	    _mm_store_si128(mout++, invec);
+	}
+	return invec;
+}
+  """)
--- a/cpp/simdcomp/simdcomp.def
+++ b/cpp/simdcomp/simdcomp.def
@@ -0,0 +1,40 @@
+EXPORTS
+	simdpack
+	simdpackwithoutmask
+	simdunpack
+	bits
+	maxbits
+	maxbits_length
+	simdmin
+	simdmin_length
+	simdmaxmin
+	simdmaxmin_length
+	simdmaxbitsd1
+	simdmaxbitsd1_length
+	simdpackd1
+	simdpackwithoutmaskd1
+	simdunpackd1
+	simdsearchd1
+	simdsearchwithlengthd1
+	simdselectd1
+	simdpackFOR
+	simdselectFOR
+	simdsearchwithlengthFOR
+	simdunpackFOR
+	simdmin_length
+	simdmaxmin
+	simdmaxmin_length
+	simdpack_length
+	simdpackFOR_length
+	simdunpackFOR_length
+	simdpack_shortlength
+	simdfastsetFOR
+	simdfastset
+	simdfastsetd1
+	simdunpack_length
+	simdunpack_shortlength
+	simdsearchwithlengthFOR
+	simdscand1
+	simdfastsetd1fromprevious
+	simdfastsetd1
+
--- a/cpp/simdcomp/src/avxbitpacking.c
+++ b/cpp/simdcomp/src/avxbitpacking.c
--- a/cpp/simdcomp/src/simdbitpacking.c
+++ b/cpp/simdcomp/src/simdbitpacking.c
--- a/cpp/simdcomp/src/simdcomputil.c
+++ b/cpp/simdcomp/src/simdcomputil.c
@@ -0,0 +1,234 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#include "simdcomputil.h"
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+#include <assert.h>
+
+/*
+ * Delta(curr, prev): per-lane difference between each 32-bit lane of curr and
+ * its predecessor. Lanes 1..3 subtract the previous lane of curr (curr shifted
+ * up by 4 bytes); lane 0 subtracts the top lane of prev (prev shifted down by
+ * 12 bytes). Arguments are parenthesized so that arbitrary expressions can be
+ * passed; note that curr is expanded twice.
+ */
+#define Delta(curr, prev) \
+    _mm_sub_epi32((curr), \
+            _mm_or_si128(_mm_slli_si128((curr), 4), _mm_srli_si128((prev), 12)))
+
+/*
+ * Bit width of v: the position of the highest set bit, counted from 1.
+ * Returns 0 when v is 0 and 32 when the top bit is set.
+ */
+uint32_t bits(const uint32_t v) {
+    /* both bit-scan primitives below are only used on non-zero input */
+    if (v == 0) {
+        return 0;
+    }
+#ifdef _MSC_VER
+    {
+        unsigned long highest;
+        _BitScanReverse(&highest, v);
+        return highest + 1;
+    }
+#else
+    /* assume a GCC-like compiler if not microsoft */
+    return 32 - __builtin_clz(v);
+#endif
+}
+
+
+
+/* ORs the four 32-bit lanes of accumulator together and returns bits() of the result. */
+static uint32_t maxbitas32int(const __m128i accumulator) {
+	/* lane0 <- a0 | a2, lane1 <- a1 | a3 (upper half folded onto lower half) */
+	const __m128i halves = _mm_or_si128(accumulator, _mm_srli_si128(accumulator, 8));
+	/* lane0 <- a0 | a1 | a2 | a3 */
+	const __m128i folded = _mm_or_si128(halves, _mm_srli_si128(halves, 4));
+	const uint32_t ored = _mm_cvtsi128_si32(folded);
+	return bits(ored);
+}
+
+/*
+ * Bit width of the OR of SIMDBlockSize integers starting at begin.
+ * Uses unaligned loads, four integers per step.
+ */
+SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) {
+	const __m128i *vec = (const __m128i *) begin;
+	__m128i ored = _mm_loadu_si128(vec);
+	uint32_t idx;
+	for (idx = 1; 4 * idx < SIMDBlockSize; idx++) {
+		ored = _mm_or_si128(ored, _mm_loadu_si128(vec + idx));
+	}
+	return maxbitas32int(ored);
+}
+/* Horizontal OR: returns the bitwise OR of the four 32-bit lanes of accumulator. */
+static uint32_t orasint(const __m128i accumulator) {
+	/* lane0 <- a0 | a2, lane1 <- a1 | a3 */
+	const __m128i halves = _mm_or_si128(accumulator, _mm_srli_si128(accumulator, 8));
+	/* lane0 <- a0 | a1 | a2 | a3 */
+	const __m128i folded = _mm_or_si128(halves, _mm_srli_si128(halves, 4));
+	return _mm_cvtsi128_si32(folded);
+}
+
+#ifdef __SSE4_1__
+
+/* Horizontal unsigned minimum: returns the smallest of the four 32-bit lanes. */
+static uint32_t minasint(const __m128i accumulator) {
+	/* lane0 <- min(a0, a2), lane1 <- min(a1, a3); the upper lanes are not used */
+	const __m128i pairmin = _mm_min_epu32(accumulator, _mm_srli_si128(accumulator, 8));
+	/* lane0 <- minimum over all four lanes */
+	const __m128i allmin = _mm_min_epu32(pairmin, _mm_srli_si128(pairmin, 4));
+	return _mm_cvtsi128_si32(allmin);
+}
+
+/* Horizontal unsigned maximum: returns the largest of the four 32-bit lanes. */
+static uint32_t maxasint(const __m128i accumulator) {
+	/* lane0 <- max(a0, a2), lane1 <- max(a1, a3); the upper lanes are not used */
+	const __m128i pairmax = _mm_max_epu32(accumulator, _mm_srli_si128(accumulator, 8));
+	/* lane0 <- maximum over all four lanes */
+	const __m128i allmax = _mm_max_epu32(pairmax, _mm_srli_si128(pairmax, 4));
+	return _mm_cvtsi128_si32(allmax);
+}
+
+/* Unsigned minimum over SIMDBlockSize integers starting at in (unaligned loads). */
+uint32_t simdmin(const uint32_t * in) {
+	const __m128i *vec = (const __m128i *) in;
+	__m128i lanemin = _mm_loadu_si128(vec);
+	uint32_t idx;
+	for (idx = 1; 4 * idx < SIMDBlockSize; idx++) {
+		lanemin = _mm_min_epu32(lanemin, _mm_loadu_si128(vec + idx));
+	}
+	return minasint(lanemin);
+}
+
+/*
+ * Computes the unsigned minimum and maximum over SIMDBlockSize integers
+ * starting at in, storing them through getmin and getmax.
+ */
+void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) {
+	const __m128i *vec = (const __m128i *) in;
+	__m128i lanemin = _mm_loadu_si128(vec);
+	__m128i lanemax = lanemin;
+	uint32_t idx;
+	for (idx = 1; 4 * idx < SIMDBlockSize; idx++) {
+		const __m128i cur = _mm_loadu_si128(vec + idx);
+		lanemin = _mm_min_epu32(lanemin, cur);
+		lanemax = _mm_max_epu32(lanemax, cur);
+	}
+	*getmin = minasint(lanemin);
+	*getmax = maxasint(lanemax);
+}
+
+
+/*
+ * Unsigned minimum over the first length integers at in.
+ * Full groups of four are reduced with SSE; the 0..3 trailing values are
+ * handled one by one. Returns 0xFFFFFFFF when length is 0.
+ */
+uint32_t simdmin_length(const uint32_t * in, uint32_t length) {
+	const uint32_t nvec = length / 4;
+	uint32_t smallest = 0xFFFFFFFF;
+	uint32_t idx;
+	if (nvec != 0) {
+		const __m128i *vec = (const __m128i *) in;
+		__m128i lanemin = _mm_loadu_si128(vec);
+		for (idx = 1; idx < nvec; idx++) {
+			lanemin = _mm_min_epu32(lanemin, _mm_loadu_si128(vec + idx));
+		}
+		smallest = minasint(lanemin);
+	}
+	/* scalar tail: elements past the last full vector */
+	for (idx = nvec * 4; idx < length; idx++) {
+		if (in[idx] < smallest) {
+			smallest = in[idx];
+		}
+	}
+	return smallest;
+}
+
+/*
+ * Computes the unsigned minimum and maximum over the first length integers at
+ * in and stores them through getmin / getmax. For length 0 the outputs are
+ * 0xFFFFFFFF and 0 respectively.
+ */
+void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) {
+	const uint32_t nvec = length / 4;
+	uint32_t idx;
+	*getmin = 0xFFFFFFFF;
+	*getmax = 0;
+	if (nvec != 0) {
+		const __m128i *vec = (const __m128i *) in;
+		__m128i lanemin = _mm_loadu_si128(vec);
+		__m128i lanemax = lanemin;
+		for (idx = 1; idx < nvec; idx++) {
+			const __m128i cur = _mm_loadu_si128(vec + idx);
+			lanemin = _mm_min_epu32(lanemin, cur);
+			lanemax = _mm_max_epu32(lanemax, cur);
+		}
+		*getmin = minasint(lanemin);
+		*getmax = maxasint(lanemax);
+	}
+	/* scalar tail: elements past the last full vector */
+	for (idx = nvec * 4; idx < length; idx++) {
+		if (in[idx] < *getmin) {
+			*getmin = in[idx];
+		}
+		if (in[idx] > *getmax) {
+			*getmax = in[idx];
+		}
+	}
+}
+
+#endif
+
+/*
+ * Bit width of the OR of the first length integers at in.
+ * Full groups of four are ORed with SSE, the trailing 0..3 values one by one.
+ * Returns 0 when length is 0.
+ */
+SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t * in,uint32_t length) {
+	const uint32_t nvec = length / 4;
+	uint32_t ored = 0;
+	uint32_t idx;
+	if (nvec != 0) {
+		const __m128i *vec = (const __m128i *) in;
+		__m128i laneor = _mm_loadu_si128(vec);
+		for (idx = 1; idx < nvec; idx++) {
+			laneor = _mm_or_si128(laneor, _mm_loadu_si128(vec + idx));
+		}
+		ored = orasint(laneor);
+	}
+	/* scalar tail: elements past the last full vector */
+	for (idx = nvec * 4; idx < length; idx++) {
+		ored |= in[idx];
+	}
+	return bits(ored);
+}
+
+
+/*
+ * maxbit over 128 integers (SIMDBlockSize) with provided initial value.
+ *
+ * ORs together the successive differences in[0] - initvalue,
+ * in[1] - in[0], ... and returns the bit width (see bits()) of that OR.
+ * The input is read with unaligned loads.
+ */
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) {
+    const __m128i* pin = (const __m128i*)(in);
+    __m128i newvec = _mm_loadu_si128(pin);
+    /* the first vector is differenced against the broadcast initial value */
+    __m128i accumulator = Delta(newvec , _mm_set1_epi32 (initvalue));
+    __m128i oldvec = newvec;
+    uint32_t k = 1;
+    for(; 4*k < SIMDBlockSize; ++k) {
+        newvec = _mm_loadu_si128(pin+k);
+        accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec));
+        oldvec = newvec;
+    }
+    /* (a former trailing store to the local initial-offset vector was dead and is gone) */
+    return maxbitas32int(accumulator);
+}
+
+
+/*
+ * maxbit over |length| integers with provided initial value.
+ *
+ * Returns the bit width of the OR of the successive differences
+ * in[0] - initvalue, in[1] - in[0], ..., in[length-1] - in[length-2].
+ * length must be greater than zero (checked with assert).
+ */
+uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
+                uint32_t length) {
+    const __m128i *vec;
+    __m128i current;
+    __m128i previous;
+    __m128i ored;
+    uint32_t lanes[4];
+    uint32_t idx;
+    uint32_t folded;
+
+    assert(length > 0);
+
+    vec = (const __m128i *)(in);
+
+    /* first vector: inputs shorter than four values are padded by repeating
+     * the last value, which only contributes zero differences */
+    if (length == 1) {
+        current = _mm_set1_epi32(in[0]);
+    } else if (length == 2) {
+        current = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
+    } else if (length == 3) {
+        current = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
+    } else {
+        current = _mm_loadu_si128(vec);
+    }
+    ored = Delta(current, _mm_set1_epi32(initvalue));
+    previous = current;
+
+    /* remaining full vectors of four integers */
+    for (idx = 1; idx * 4 + 4 <= length; idx++) {
+        current = _mm_loadu_si128(vec + idx);
+        ored = _mm_or_si128(ored, Delta(current, previous));
+        previous = current;
+    }
+
+    /* collapse the four lanes into one integer */
+    _mm_storeu_si128((__m128i *)(lanes), ored);
+    folded = lanes[0] | lanes[1] | lanes[2] | lanes[3];
+
+    /* scalar tail: values past the last full vector */
+    for (idx *= 4; idx < length; idx++) {
+        const uint32_t before = (idx == 0) ? initvalue : in[idx - 1];
+        folded |= in[idx] - before;
+    }
+
+    return bits(folded);
+}
--- a/cpp/simdcomp/src/simdfor.c
+++ b/cpp/simdcomp/src/simdfor.c
--- a/cpp/simdcomp/src/simdintegratedbitpacking.c
+++ b/cpp/simdcomp/src/simdintegratedbitpacking.c
--- a/cpp/simdcomp/src/simdpackedsearch.c
+++ b/cpp/simdcomp/src/simdpackedsearch.c
--- a/cpp/simdcomp/src/simdpackedselect.c
+++ b/cpp/simdcomp/src/simdpackedselect.c
--- a/cpp/simdcomp/tests/unit.c
+++ b/cpp/simdcomp/tests/unit.c
@@ -0,0 +1,900 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "simdcomp.h"
+
+
+
+/**
+ * Round-trips random values through simdpack_shortlength /
+ * simdunpack_shortlength for every bit width in [0, 32) and every length in
+ * [0, 128], and checks the returned end pointer against
+ * simdpack_compressedbytes().
+ *
+ * Returns 0 on success, -1 on the first mismatch or on allocation failure.
+ */
+int testshortpack() {
+	int bit;
+	size_t i;
+	size_t length;
+	__m128i * bb;
+	srand(0);
+	printf("testshortpack\n");
+	for (bit = 0; bit < 32; ++bit) {
+		const size_t N = 128;
+		/* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+		const uint32_t mask = (1U << bit) - 1;
+		int ok = 1;
+		uint32_t * data = malloc(N * sizeof(uint32_t));
+		uint32_t * backdata = malloc(N * sizeof(uint32_t));
+		uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		if (data == NULL || backdata == NULL || buffer == NULL) {
+			printf("out of memory\n");
+			ok = 0;
+		}
+		for (i = 0; ok && i < N; ++i) {
+			data[i] = rand() & mask;
+		}
+		for (length = 0; ok && length <= N; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			bb = simdpack_shortlength(data, length, (__m128i *) buffer,
+					bit);
+			if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
+				printf("bug\n");
+				ok = 0;
+				break;
+			}
+			simdunpack_shortlength((__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					printf("bug\n");
+					ok = 0;
+					break;
+				}
+			}
+		}
+		/* release the buffers on every path, including failures */
+		free(data);
+		free(backdata);
+		free(buffer);
+		if (!ok) {
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * Round-trips random values through simdpack_length / simdunpack_length for
+ * every bit width in [0, 32) and every length in [0, 2048], and checks the
+ * returned end pointer against simdpack_compressedbytes().
+ *
+ * Returns 0 on success, -1 on the first mismatch or on allocation failure.
+ */
+int testlongpack() {
+	int bit;
+	size_t i;
+	size_t length;
+	__m128i * bb;
+	srand(0);
+	printf("testlongpack\n");
+	for (bit = 0; bit < 32; ++bit) {
+		const size_t N = 2048;
+		/* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+		const uint32_t mask = (1U << bit) - 1;
+		int ok = 1;
+		uint32_t * data = malloc(N * sizeof(uint32_t));
+		uint32_t * backdata = malloc(N * sizeof(uint32_t));
+		uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		if (data == NULL || backdata == NULL || buffer == NULL) {
+			printf("out of memory\n");
+			ok = 0;
+		}
+		for (i = 0; ok && i < N; ++i) {
+			data[i] = rand() & mask;
+		}
+		for (length = 0; ok && length <= N; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			bb = simdpack_length(data, length, (__m128i *) buffer,
+					bit);
+			if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
+				printf("bug\n");
+				ok = 0;
+				break;
+			}
+			simdunpack_length((__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					printf("bug\n");
+					ok = 0;
+					break;
+				}
+			}
+		}
+		/* release the buffers on every path, including failures */
+		free(data);
+		free(backdata);
+		free(buffer);
+		if (!ok) {
+			return -1;
+		}
+	}
+	return 0;
+}
+
+
+
+/**
+ * Exercises simdfastset on a packed block of 128 values for every bit width
+ * in [0, 32): first a plain simdpack/simdunpack round trip, then the block is
+ * overwritten slot by slot in reversed order, then in natural order, and the
+ * unpacked result is compared after each step.
+ *
+ * Returns 0 on success, -1 on the first mismatch or on allocation failure.
+ */
+int testset() {
+	int bit;
+	int rc = -1;
+	size_t i;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	if (data == NULL || backdata == NULL || buffer == NULL) {
+		printf("out of memory\n");
+		goto done;
+	}
+
+	srand(0);
+
+	for (bit = 0; bit < 32; ++bit) {
+		/* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+		const uint32_t mask = (1U << bit) - 1;
+		printf("simple set %d \n",bit);
+
+		for (i = 0; i < N; ++i) {
+			data[i] = rand() & mask;
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpack(data, (__m128i *) buffer, bit);
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i]) {
+				printf("bug\n");
+				goto done;
+			}
+		}
+
+		/* write data[] into the packed block in reversed slot order */
+		for(i = N  ; i > 0; i--) {
+			simdfastset((__m128i *) buffer, bit, data[N - i], i - 1);
+		}
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[N - i - 1]) {
+				printf("bug\n");
+				goto done;
+			}
+		}
+		/* repack, then rewrite every slot with its own value */
+		simdpack(data, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastset((__m128i *) buffer, bit, data[i - 1], i - 1);
+		}
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i]) {
+				printf("bug\n");
+				goto done;
+			}
+		}
+
+	}
+	rc = 0;
+done:
+	/* single exit so the buffers are released on failure paths too */
+	free(data);
+	free(backdata);
+	free(buffer);
+
+	return rc;
+}
+
+#ifdef __SSE4_1__
+
+/**
+ * Exercises simdfastsetd1 / simdselectd1 for every bit width in [0, 32):
+ * starting from a packed block of zeroes, each slot is set to the matching
+ * value of a non-decreasing random sequence and immediately read back; the
+ * whole block is then unpacked with simdunpackd1 and compared.
+ *
+ * Returns 0 on success, -1 on the first mismatch or on allocation failure.
+ */
+int testsetd1() {
+	int bit;
+	int rc = -1;
+	size_t i;
+	uint32_t newvalue;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
+
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	if (data == NULL || datazeroes == NULL || backdata == NULL || buffer == NULL) {
+		printf("out of memory\n");
+		goto done;
+	}
+
+	srand(0);
+	for (bit = 0; bit < 32; ++bit) {
+		/* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+		const uint32_t mask = (1U << bit) - 1;
+		printf("simple set d1 %d \n",bit);
+		data[0] = rand() & mask;
+		datazeroes[0] = 0;
+
+		for (i = 1; i < N; ++i) {
+			data[i] = data[i - 1] + (rand() & mask);
+			datazeroes[i] = 0;
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpackd1(0,datazeroes, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastsetd1(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
+			newvalue = simdselectd1(0, (const __m128i *) buffer, bit,i - 1);
+			if( newvalue != data[i-1] ) {
+				printf("bad set-select\n");
+				goto done;
+			}
+		}
+		simdunpackd1(0,(__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i])
+				goto done;
+		}
+	}
+	rc = 0;
+done:
+	/* single exit so the buffers are released on failure paths too */
+	free(data);
+	free(backdata);
+	free(buffer);
+	free(datazeroes);
+	return rc;
+}
+#endif
+
+/**
+ * Exercises simdfastsetFOR / simdselectFOR for every bit width in [0, 32):
+ * starting from a packed block of zeroes, each slot is set to a random value
+ * and immediately read back; the whole block is then unpacked with
+ * simdunpackFOR and compared.
+ *
+ * Returns 0 on success, -1 on the first mismatch or on allocation failure.
+ */
+int testsetFOR() {
+	int bit;
+	int rc = -1;
+	size_t i;
+	uint32_t newvalue;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
+
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	if (data == NULL || datazeroes == NULL || backdata == NULL || buffer == NULL) {
+		printf("out of memory\n");
+		goto done;
+	}
+
+	srand(0);
+	for (bit = 0; bit < 32; ++bit) {
+		/* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+		const uint32_t mask = (1U << bit) - 1;
+		printf("simple set FOR %d \n",bit);
+		for (i = 0; i < N; ++i) {
+			data[i] = (rand() & mask);
+			datazeroes[i] = 0;
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpackFOR(0,datazeroes, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastsetFOR(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
+			newvalue = simdselectFOR(0, (const __m128i *) buffer, bit,i - 1);
+			if( newvalue != data[i-1] ) {
+				printf("bad set-select\n");
+				goto done;
+			}
+		}
+		simdunpackFOR(0,(__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i])
+				goto done;
+		}
+	}
+	rc = 0;
+done:
+	/* single exit so the buffers are released on failure paths too */
+	free(data);
+	free(backdata);
+	free(buffer);
+	free(datazeroes);
+	return rc;
+}
+
+/**
+ * Round-trips random values (shifted by a fixed offset of 7) through
+ * simdpackFOR_length / simdunpackFOR_length for every bit width in [0, 32)
+ * and every length in [0, 128], and checks the returned end pointer against
+ * simdpackFOR_compressedbytes().
+ *
+ * Returns 0 on success, -1 on the first mismatch or on allocation failure.
+ */
+int testshortFORpack() {
+	int bit;
+	size_t i;
+	__m128i * rb;
+	size_t length;
+	uint32_t offset = 7;
+	srand(0);
+	for (bit = 0; bit < 32; ++bit) {
+		const size_t N = 128;
+		/* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+		const uint32_t mask = (1U << bit) - 1;
+		int ok = 1;
+		uint32_t * data = malloc(N * sizeof(uint32_t));
+		uint32_t * backdata = malloc(N * sizeof(uint32_t));
+		uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		if (data == NULL || backdata == NULL || buffer == NULL) {
+			printf("out of memory\n");
+			ok = 0;
+		}
+		for (i = 0; ok && i < N; ++i) {
+			data[i] = (rand() & mask) + offset;
+		}
+		for (length = 0; ok && length <= N; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			rb = simdpackFOR_length(offset,data, length, (__m128i *) buffer,
+					bit);
+			if(((rb - (__m128i *) buffer)*sizeof(__m128i)) != (unsigned) simdpackFOR_compressedbytes(length,bit)) {
+				ok = 0;
+				break;
+			}
+			simdunpackFOR_length(offset,(__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					ok = 0;
+					break;
+				}
+			}
+		}
+		/* release the buffers on every path, including failures */
+		free(data);
+		free(backdata);
+		free(buffer);
+		if (!ok) {
+			return -1;
+		}
+	}
+	return 0;
+}
+
+
+#ifdef __AVX2__
+
+/**
+ * For every bit width in [0, 32) runs 100 trials: fills AVXBlockSize random
+ * values, checks avxmaxbits against maxbits_length, then round-trips the
+ * block through avxpackwithoutmask / avxunpack. On a mismatch the whole
+ * block is dumped.
+ *
+ * Returns 0 on success, -1 on the first mismatch or on allocation failure.
+ */
+int testbabyavx() {
+	int bit;
+	int trial;
+	unsigned int i,j;
+	const size_t N = AVXBlockSize;
+	srand(0);
+	printf("testbabyavx\n");
+	printf("bit = ");
+	for (bit = 0; bit < 32; ++bit) {
+		/* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+		const uint32_t mask = (1U << bit) - 1;
+		printf(" %d ",bit);
+		fflush(stdout);
+		for(trial = 0; trial < 100; ++trial) {
+			int ok = 1;
+			uint32_t * data = malloc(N * sizeof(uint32_t)+ 64 * sizeof(uint32_t));
+			uint32_t * backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t) );
+			__m256i * buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32);
+
+			if (data == NULL || backdata == NULL || buffer == NULL) {
+				printf("out of memory\n");
+				ok = 0;
+			}
+			for (i = 0; ok && i < N; ++i) {
+				data[i] = rand() & mask;
+			}
+			for (i = 0; ok && i < N; ++i) {
+				backdata[i] = 0;
+			}
+			if(ok && avxmaxbits(data) != maxbits_length(data,N)) {
+				printf("avxmaxbits is buggy\n");
+				ok = 0;
+			}
+
+			if (ok) {
+				avxpackwithoutmask(data, buffer, bit);
+				avxunpack(buffer, backdata, bit);
+			}
+			for (i = 0; ok && i < AVXBlockSize; ++i) {
+				if (data[i] != backdata[i]) {
+					printf("bug\n");
+					/* indices and values are unsigned, so print them with %u */
+					for (j = 0; j < N; ++j) {
+						if (data[j] != backdata[j]) {
+							printf("data[%u]=%u v.s. backdata[%u]=%u\n",j,(unsigned) data[j],j,(unsigned) backdata[j]);
+						} else {
+							printf("data[%u]=%u\n",j,(unsigned) data[j]);
+						}
+					}
+					ok = 0;
+				}
+			}
+			/* release the buffers on every path, including failures */
+			free(data);
+			free(backdata);
+			free(buffer);
+			if (!ok) {
+				return -1;
+			}
+		}
+	}
+	printf("\n");
+	return 0;
+}
+
+/**
+ * Packs and unpacks 5000 blocks of AVXBlockSize integers of the form
+ * k * gap (gap = 1, 3, 9, ..., 387420489) with avxpackwithoutmask /
+ * avxunpack, using the bit width reported by avxmaxbits, which is also
+ * cross-checked against maxbits_length.
+ *
+ * Returns 0 on success, -1 if avxmaxbits disagrees or allocation fails,
+ * -2 if a round trip does not reproduce the input.
+ */
+int testavx2() {
+    int N = 5000 * AVXBlockSize, gap;
+    int rc = 0;
+    __m256i * buffer = malloc(AVXBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(AVXBlockSize * sizeof(uint32_t));
+    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        rc = -1;
+        goto done;
+    }
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", (unsigned) gap);
+        /* multiply as unsigned: k * gap overflows a signed int for large gaps */
+        for (k = 0; k < N; ++k)
+            datain[k] = (uint32_t) k * (uint32_t) gap;
+        for (k = 0; k * AVXBlockSize < N; ++k) {
+            /*
+               First part works for general arrays (sorted or unsorted)
+            */
+            int j;
+            /* we compute the bit width */
+            const uint32_t b = avxmaxbits(datain + k * AVXBlockSize);
+            if(avxmaxbits(datain + k * AVXBlockSize) != maxbits_length(datain + k * AVXBlockSize,AVXBlockSize)) {
+                printf("avxmaxbits is buggy %u %u \n",
+                        (unsigned) avxmaxbits(datain + k * AVXBlockSize),
+                        (unsigned) maxbits_length(datain + k * AVXBlockSize,AVXBlockSize));
+                rc = -1;
+                goto done;
+            }
+            printf("bit width = %u\n",(unsigned) b);
+
+
+            /* we read AVXBlockSize integers at "datain + k * AVXBlockSize" and
+               write the packed vectors at "buffer" */
+            avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b);
+            /* we read the packed vectors back from "buffer" and write
+               AVXBlockSize integers at backbuffer */
+            avxunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < AVXBlockSize; ++j) {
+                if (backbuffer[j] != datain[k * AVXBlockSize + j]) {
+                    int i;
+                    printf("bug in avxpack\n");
+                    for(i = 0; i < AVXBlockSize; ++i) {
+                        printf("data[%d]=%u got back %u %s\n",i,
+                                (unsigned) datain[k * AVXBlockSize + i],(unsigned) backbuffer[i],
+                                datain[k * AVXBlockSize + i]!=backbuffer[i]?"bug":"");
+                    }
+                    rc = -2;
+                    goto done;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+done:
+    /* single exit so the buffers are released on failure paths too */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return rc;
+}
+#endif /* avx2 */
+
+/**
+ * Packs and unpacks 5000 blocks of SIMDBlockSize integers of the form
+ * k * gap (gap = 1, 3, 9, ..., 387420489), first with plain bit packing
+ * (maxbits / simdpackwithoutmask / simdunpack) and then with differential
+ * coding (simdmaxbitsd1 / simdpackwithoutmaskd1 / simdunpackd1). Every block
+ * is delta-coded relative to an initial offset of 0.
+ *
+ * Returns 0 on success, -2 if the plain round trip fails, -3 if the
+ * differential round trip fails, -1 on allocation failure.
+ */
+int test() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int rc = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        rc = -1;
+        goto done;
+    }
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", (unsigned) gap);
+        /* multiply as unsigned: k * gap overflows a signed int for large gaps */
+        for (k = 0; k < N; ++k)
+            datain[k] = (uint32_t) k * (uint32_t) gap;
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            /*
+               First part works for general arrays (sorted or unsorted)
+            */
+            int j;
+            /* we compute the bit width */
+            const uint32_t b = maxbits(datain + k * SIMDBlockSize);
+            /* we read 128 integers at "datain + k * SIMDBlockSize" and
+               write b 128-bit vectors at "buffer" */
+            simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
+            /* we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
+                    printf("bug in simdpack\n");
+                    rc = -2;
+                    goto done;
+                }
+            }
+
+            {
+                /*
+                 next part assumes that the data is sorted (uses differential coding)
+                */
+                const uint32_t offset = 0;
+                /* we compute the bit width */
+                const uint32_t b1 = simdmaxbitsd1(offset,
+                    datain + k * SIMDBlockSize);
+                /* we read 128 integers at "datain + k * SIMDBlockSize" and
+                   write b1 128-bit vectors at "buffer" */
+                simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer,
+                    b1);
+                /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+                simdunpackd1(offset, buffer, backbuffer, b1);
+                for (j = 0; j < SIMDBlockSize; ++j) {
+                    if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
+                        printf("bug in simdpack d1\n");
+                        rc = -3;
+                        goto done;
+                    }
+                }
+                /* offset is block-local, so nothing is carried to the next block */
+            }
+        }
+    }
+    printf("Code looks good.\n");
+done:
+    /* single exit so the buffers are released on failure paths too */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return rc;
+}
+
+#ifdef __SSE4_1__
+/**
+ * Frame-of-reference round trip over 5000 blocks of SIMDBlockSize integers
+ * of the form k * gap (gap = 1, 2, 4, ... up to 387420489). For each block
+ * the minimum and maximum are taken with simdmaxmin_length, the block is
+ * packed with simdpackFOR using bits(max - min), every slot is read back
+ * with simdselectFOR, and finally the block is unpacked with simdunpackFOR.
+ *
+ * Returns 0 on success, -3 if simdselectFOR disagrees, -2 if the unpacked
+ * block disagrees, -1 on allocation failure.
+ */
+int testFOR() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int rc = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t tmax, tmin, tb;
+    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        rc = -1;
+        goto done;
+    }
+    for (gap = 1; gap <= 387420489; gap *= 2) {
+        int k;
+        printf(" gap = %u \n", (unsigned) gap);
+        /* multiply as unsigned: k * gap overflows a signed int for large gaps */
+        for (k = 0; k < N; ++k)
+            datain[k] = (uint32_t) k * (uint32_t) gap;
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            int j;
+            simdmaxmin_length(datain + k * SIMDBlockSize,SIMDBlockSize,&tmin,&tmax);
+            /* we compute the bit width */
+            tb  = bits(tmax - tmin);
+
+
+            /* we read 128 integers at "datain + k * SIMDBlockSize" and
+               write tb 128-bit vectors at "buffer" */
+            simdpackFOR(tmin,datain + k * SIMDBlockSize, buffer, tb);
+
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                uint32_t selectedvalue = simdselectFOR(tmin,buffer,tb,j);
+                if (selectedvalue != datain[k * SIMDBlockSize + j]) {
+                    printf("bug in simdselectFOR\n");
+                    rc = -3;
+                    goto done;
+                }
+            }
+            /* we read back tb 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpackFOR(tmin,buffer, backbuffer, tb);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
+                    printf("bug in simdpackFOR\n");
+                    rc = -2;
+                    goto done;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+done:
+    /* single exit so the buffers are released on failure paths too */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return rc;
+}
+#endif
+
+#define MAX 300
+/**
+ * Checks simdmaxbitsd1_length on inputs of length 1..MAX whose successive
+ * differences (starting from 0) are all 1 except the last one, which is 2:
+ *   length 1: 2
+ *   length 2: 1 2
+ *   length 3: 1 1 2
+ *   ...
+ * so the expected result is always 2. Returns 0 on success, -1 otherwise.
+ */
+int test_simdmaxbitsd1_length() {
+    uint32_t result, buffer[MAX + 1];
+    int last;
+
+    memset(&buffer[0], 0xff, sizeof(buffer));
+
+    for (last = 0; last < MAX; last++) {
+      /* slots below last - 1 already hold 1, 2, ... from earlier rounds;
+       * only the previous "final" slot has to be pulled back by one */
+      if (last > 0)
+        buffer[last - 1] = last;
+      buffer[last] = last + 2;
+
+      result = simdmaxbitsd1_length(0, &buffer[0], last + 1);
+      if (result != 2) {
+        printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n",
+                result, last);
+        return -1;
+      }
+    }
+    printf("simdmaxbitsd1_length: ok\n");
+    return 0;
+}
+
+/*
+ * qsort-style comparator for uint32_t values: returns -1, 0 or 1 when the
+ * value at a is smaller than, equal to, or larger than the value at b.
+ */
+int uint32_cmp(const void *a, const void *b)
+{
+    const uint32_t lhs = *(const uint32_t *)a;
+    const uint32_t rhs = *(const uint32_t *)b;
+    /* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+#ifdef __SSE4_1__
+/**
+ * Packs the values 1..128 with simdpackwithoutmaskd1 at every bit width in
+ * [1, 32] and runs simdsearchd1 for the keys 0, 1..128 and 200, asserting on
+ * the returned position and on the value written to the result argument.
+ * All checks are assert()s; the function itself always returns 0.
+ */
+int test_simdpackedsearch() {
+    uint32_t values[128];
+    uint32_t found = 0;
+    uint32_t start = 0;
+    __m128i state;
+    int width, key;
+
+    /* values[i] = i + 1 */
+    for (key = 0; key < 128; key++)
+        values[key] = (uint32_t)(key + 1);
+
+    for (width = 1; width <= 32; width++) {
+        uint32_t packed[128];
+        /* delta-encode to 'width' bits */
+        simdpackwithoutmaskd1(start, values, (__m128i *)packed, width);
+        printf("simdsearchd1: %d bits\n", width);
+
+        /* the state vector is rebuilt before every search because it is
+         * passed by address */
+        state = _mm_set1_epi32(start);
+        assert(simdsearchd1(&state, (__m128i *)packed, width, 0, &found) == 0);
+        assert(found > 0);
+
+        for (key = 1; key <= 128; key++) {
+            state = _mm_set1_epi32(start);
+            assert(simdsearchd1(&state, (__m128i *)packed, width,
+                                    (uint32_t)key, &found) == key - 1);
+            assert(found == (unsigned)key);
+        }
+
+        /* a key beyond the largest stored value */
+        state = _mm_set1_epi32(start);
+        assert(simdsearchd1(&state, (__m128i *)packed, width, 200, &found)
+                        == 128);
+        assert(found > 200);
+    }
+    printf("simdsearchd1: ok\n");
+    return 0;
+}
+
+/**
+ * For every b in [1, 32] fills a 128-value buffer with
+ * maxv * (i + 1) / 128 (maxv being the largest b-bit value), packs it with
+ * simdpackFOR relative to its minimum, and then checks simdselectFOR on every
+ * slot and simdsearchwithlengthFOR on every stored value.
+ * All checks are assert()s; the function itself always returns 0.
+ */
+int test_simdpackedsearchFOR() {
+    uint32_t buffer[128];
+    uint32_t result = 0;
+    int b;
+    uint32_t i;
+    uint32_t maxv, tmin, tmax, tb;
+    uint32_t out[128];
+
+    /* this test creates frame-of-reference encoded buffers with different
+     * bits, then searches for each stored key */
+    for (b = 1; b <= 32; b++) {
+        /* initialize the buffer */
+    	maxv = (b == 32)
+    			? 0xFFFFFFFF
+    					: ((1U<<b) - 1);
+        for (i = 0; i < 128; i++)
+            buffer[i] = maxv * (i + 1) / 128;
+        simdmaxmin_length(buffer,SIMDBlockSize,&tmin,&tmax);
+   	    /* we compute the bit width */
+        tb  = bits(tmax - tmin);
+        /* FOR-encode to 'tb' bits, relative to the block minimum */
+        simdpackFOR(tmin, buffer, (__m128i *)out, tb);
+        /* the progress line used to name simdsearchd1, which is not what runs here */
+        printf("simdsearchFOR: %d bits\n", b);
+
+        /* now perform the searches */
+        for (i = 0; i < 128; i++) {
+        	assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb,i));
+        }
+        for (i = 0; i < 128; i++) {
+            int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb,
+                                    128,buffer[i], &result) ;
+            assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == buffer[x]);
+            assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == result);
+            assert(buffer[x] == result);
+            assert(result == buffer[i]);
+            assert(buffer[x] == buffer[i]);
+        }
+    }
+    printf("simdsearchFOR: ok\n");
+    return 0;
+}
+
+/**
+ * For every bit width b in [0, 32] builds a non-decreasing sequence of 128
+ * values whose successive differences fit in b bits, packs it with
+ * simdpackwithoutmaskd1, checks the simdunpackd1 round trip, and then compares
+ * simdsearchd1 with simdsearchwithlengthd1 for each stored key, each key
+ * minus one and each key plus one.
+ * All checks are assert()s; the function itself always returns 0.
+ */
+int test_simdpackedsearch_advanced() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+	uint32_t out[128];
+    uint32_t result = 0;
+    uint32_t b, i;
+    uint32_t init = 0;
+    __m128i initial = _mm_set1_epi32(init);
+
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 0; b <= 32; b++) {
+    	uint32_t prev = init;
+        /* initialize the buffer with pseudo-random deltas below 2^b */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)(1431655765 * i + 0xFFFFFFFF)) ;
+            /* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+            if(b < 32) buffer[i] %= (1U<<b);
+        }
+
+        qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
+
+        /* turn the deltas into running sums */
+        for (i = 0; i < 128; i++) {
+           buffer[i] = buffer[i] + prev;
+           prev = buffer[i];
+        }
+        /* clamp any value that dropped below its predecessor (wrap-around) */
+        for (i = 1; i < 128; i++) {
+        	if(buffer[i] < buffer[i-1] )
+        		buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(init, buffer)<=b);
+        for (i = 0; i < 128; i++) {
+        	out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
+        simdunpackd1(init,  (__m128i *)out, backbuffer, b);
+
+        for (i = 0; i < 128; i++) {
+        	assert(buffer[i] == backbuffer[i]);
+        }
+
+        /* b is unsigned, so it is printed with %u */
+        printf("advanced simdsearchd1: %u bits\n", (unsigned) b);
+
+        /* search for every stored key */
+        for (i = 0; i < 128; i++) {
+        	int pos;
+            initial = _mm_set1_epi32(init);
+        	pos = simdsearchd1(&initial, (__m128i *)out, b,
+                    buffer[i], &result);
+        	assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i], &result));
+        	assert(buffer[pos] == buffer[i]);
+            if(pos > 0)
+            	assert(buffer[pos - 1] < buffer[i]);
+            assert(result == buffer[i]);
+        }
+        /* search for every stored key minus one (skipping key 0) */
+        for (i = 0; i < 128; i++) {
+        	int pos;
+        	if(buffer[i] == 0) continue;
+        	initial = _mm_set1_epi32(init);
+        	pos = simdsearchd1(&initial, (__m128i *)out, b,
+                    buffer[i] - 1, &result);
+        	assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i] - 1, &result));
+        	assert(buffer[pos] >= buffer[i]  - 1);
+            if(pos > 0)
+            	assert(buffer[pos - 1] < buffer[i]  - 1);
+            assert(result == buffer[pos]);
+        }
+        /* search for every stored key plus one (skipping keys that would wrap) */
+		for (i = 0; i < 128; i++) {
+			int pos;
+			if (buffer[i] + 1 == 0)
+				continue;
+			initial = _mm_set1_epi32(init);
+			pos = simdsearchd1(&initial, (__m128i *) out, b,
+					buffer[i] + 1, &result);
+			assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i] + 1, &result));
+			if(pos == 128) {
+				/* not found: the key was the largest stored value */
+				assert(buffer[i] == buffer[127]);
+			} else {
+			  assert(buffer[pos] >= buffer[i] + 1);
+			  if (pos > 0)
+				assert(buffer[pos - 1] < buffer[i] + 1);
+			  assert(result == buffer[pos]);
+			}
+		}
+    }
+    printf("advanced simdsearchd1: ok\n");
+    return 0;
+}
+
+/**
+ * Packs the values 33, 34, ..., 160 with simdpackwithoutmaskd1 (initial
+ * value 33) at every bit width in [1, 32] and asserts that simdselectd1
+ * returns the original value for every slot.
+ * All checks are assert()s; the function itself always returns 0.
+ */
+int test_simdpackedselect() {
+    uint32_t values[128];
+    uint32_t initial = 33;
+    int width, slot;
+
+    /* values[slot] = initial + slot */
+    for (slot = 0; slot < 128; slot++)
+        values[slot] = (uint32_t)(initial + slot);
+
+    for (width = 1; width <= 32; width++) {
+        uint32_t packed[128];
+        /* delta-encode to 'width' bits */
+        simdpackwithoutmaskd1(initial, values, (__m128i *)packed, width);
+
+        printf("simdselectd1: %d bits\n", width);
+
+        /* read every slot back */
+        for (slot = 0; slot < 128; slot++) {
+            assert(simdselectd1(initial, (__m128i *)packed, width, (uint32_t)slot)
+                            == initial + slot);
+        }
+    }
+    printf("simdselectd1: ok\n");
+    return 0;
+}
+
+/**
+ * For every bit width b in [0, 32] builds a non-decreasing sequence of 128
+ * values (initial value 33) whose successive differences fit in b bits,
+ * packs it with simdpackwithoutmaskd1 and asserts that simdselectd1 returns
+ * the original value for every slot.
+ * All checks are assert()s; the function itself always returns 0.
+ */
+int test_simdpackedselect_advanced() {
+    uint32_t buffer[128];
+    uint32_t initial = 33;
+    uint32_t b;
+    int i;
+
+    /* this test creates delta encoded buffers with different bits, then
+     * selects every slot */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+    	uint32_t out[128];
+        /* initialize the buffer with deltas below 2^b */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)(165576 * i)) ;
+            /* unsigned shift: 1 << 31 on a signed int is undefined behavior */
+            if(b < 32) buffer[i] %= (1U<<b);
+        }
+        /* turn the deltas into running sums */
+        for (i = 0; i < 128; i++) {
+           buffer[i] = buffer[i] + prev;
+           prev = buffer[i];
+        }
+
+        /* clamp any value that dropped below its predecessor (wrap-around) */
+        for (i = 1; i < 128; i++) {
+        	if(buffer[i] < buffer[i-1] )
+        		buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+
+        for (i = 0; i < 128; i++) {
+        	out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+
+        /* b is unsigned, so it is printed with %u */
+        printf("simdselectd1: %u bits\n", (unsigned) b);
+
+        /* read every slot back */
+        for (i = 0; i < 128; i++) {
+        	uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i);
+            assert(valretrieved == buffer[i]);
+        }
+    }
+    printf("advanced simdselectd1: ok\n");
+    return 0;
+}
+#endif
+
+
+/**
+ * Test driver: runs every test in a fixed order and stops at the first one
+ * that returns non-zero, printing that test's failure line and returning its
+ * status. Tests that need SSE4.1 or AVX2 are only part of the table when the
+ * matching compiler macro is defined.
+ */
+int main() {
+    /* each entry pairs a test with the exact line printed when it fails */
+    static const struct {
+        int (*run)();
+        const char *onfail;
+    } suite[] = {
+        { testsetFOR, "test failure 1\n" },
+#ifdef __SSE4_1__
+        { testsetd1, "test failure 2\n" },
+#endif
+        { testset, "test failure 3\n" },
+        { testshortFORpack, "test failure 4\n" },
+        { testshortpack, "test failure 5\n" },
+        { testlongpack, "test failure 6\n" },
+#ifdef __SSE4_1__
+        { test_simdpackedsearchFOR, "test failure 7\n" },
+        { testFOR, "test failure 8\n" },
+#endif
+#ifdef __AVX2__
+        { testbabyavx, "test failure baby avx\n" },
+        { testavx2, "test failure 9 avx\n" },
+#endif
+        { test, "test failure 9\n" },
+        { test_simdmaxbitsd1_length, "test failure 10\n" },
+#ifdef __SSE4_1__
+        { test_simdpackedsearch, "test failure 11\n" },
+        { test_simdpackedsearch_advanced, "test failure 12\n" },
+        { test_simdpackedselect, "test failure 13\n" },
+        { test_simdpackedselect_advanced, "test failure 14\n" },
+#endif
+    };
+    size_t t;
+
+    for (t = 0; t < sizeof(suite) / sizeof(suite[0]); ++t) {
+        const int r = suite[t].run();
+        if (r) {
+            printf("%s", suite[t].onfail);
+            return r;
+        }
+    }
+    printf("All tests OK!\n");
+
+    return 0;
+}
--- a/cpp/simdcomp/tests/unit_chars.c
+++ b/cpp/simdcomp/tests/unit_chars.c
@@ -0,0 +1,102 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "simdcomp.h"
+
+/* Expands to a pseudo-random byte value in [0, 255] drawn from rand(). */
+#define get_random_char() ((uint8_t)(rand() % 256))
+
+/**
+ * Round-trip test of the bit packing routines on random byte data.
+ *
+ * At every pass datain is refilled with random bytes; each block of
+ * SIMDBlockSize integers is then packed and unpacked twice, once with plain
+ * bit packing and once with differential (d1) coding, and the output is
+ * compared with the input.
+ *
+ * Returns 0 on success, -1 when a buffer cannot be allocated, -2 when the plain
+ * round trip alters the data and -3 when the differential round trip does.
+ */
+int main(void) {
+    const int N = 5000 * SIMDBlockSize;
+    /* size in bytes of one unpacked block of SIMDBlockSize integers */
+    const size_t blockbytes = SIMDBlockSize * sizeof(uint32_t);
+    int gap, status = 0;
+    __m128i * buffer = malloc(blockbytes);
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(blockbytes);
+
+    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
+        printf("memory allocation failed\n");
+        status = -1;
+        goto cleanup;
+    }
+    srand((unsigned int) time(NULL));
+
+    /* gap only counts the passes: fresh random data is drawn at each of them */
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        /* initial value of the differential coding: last integer of the
+           previous block, 0 for the first block of a pass */
+        uint32_t offset = 0;
+
+        printf(" gap = %d \n", gap);
+
+        /* simulate some random character string, don't care about endianness */
+        for (k = 0; k < N; ++k) {
+            uint8_t chars[4];
+            int c;
+
+            for (c = 0; c < 4; ++c) {
+                chars[c] = get_random_char();
+            }
+            memcpy(&datain[k], chars, sizeof(chars));
+        }
+
+        /* round trip every block of SIMDBlockSize integers */
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            uint32_t * block = datain + k * SIMDBlockSize;
+            /* we compute the bit width of each coding */
+            const uint32_t b = maxbits(block);
+            const uint32_t b1 = simdmaxbitsd1(offset, block);
+
+            /* First part works for general arrays (sorted or unsorted) */
+            /* we read SIMDBlockSize integers at "block" and
+               write b 128-bit vectors at "buffer" */
+            simdpackwithoutmask(block, buffer, b);
+            /* we read back b 128-bit vectors at "buffer" and write the integers at backbuffer */
+            simdunpack(buffer, backbuffer, b);
+            if (memcmp(backbuffer, block, blockbytes) != 0) {
+                printf("bug in simdpack\n");
+                status = -2;
+                goto cleanup;
+            }
+
+            /* Second part uses differential coding (meant for sorted data, checked here on random data) */
+            /* we read SIMDBlockSize integers at "block" and
+               write b1 128-bit vectors at "buffer" */
+            simdpackwithoutmaskd1(offset, block, buffer, b1);
+            /* we read back b1 128-bit vectors at "buffer" and write the integers at backbuffer */
+            simdunpackd1(offset, buffer, backbuffer, b1);
+            if (memcmp(backbuffer, block, blockbytes) != 0) {
+                printf("bug in simdpack\n");
+                status = -3;
+                goto cleanup;
+            }
+            /* the next block is coded relative to the last integer of this one */
+            offset = block[SIMDBlockSize - 1];
+        }
+    }
+    printf("Code looks good.\n");
+
+cleanup:
+    /* single exit path so that the buffers are released on failure too */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}