Edited changelog and bumped version.

This branch is to be published as a hotfix for 5.1.0. Closes #280 Closes #274 Closes #289
Added comments from code review.
2026-01-01 23:12:54 +00:00 · 2018-05-05 21:00:10 -07:00 · 2018-05-05 20:51:40 -07:00 · 2018-05-02 23:32:27 -07:00 · 2018-05-02 22:33:38 -07:00 · 2018-05-02 11:42:07 -07:00
126 changed files with 99722 additions and 2321 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,6 +26,7 @@ before_script:
  - export PATH=$HOME/.cargo/bin:$PATH
  - cargo install cargo-update || echo "cargo-update already installed"
  - cargo install cargo-travis || echo "cargo-travis already installed"
+  - cargo install-update -a # update outdated cached binaries
 script:
  - cargo build
  - cargo test
@@ -33,5 +34,5 @@ script:
  - cargo run --example simple_search
  - cargo doc
 after_success:
-  - cargo coveralls --exclude-pattern src/functional_test.rs
+  - cargo coveralls --exclude-pattern cpp/,src/functional_test.rs
  - cargo doc-upload
--- a/.vimrc
+++ b/.vimrc
@@ -0,0 +1,13 @@
+set wildignore+=*/examples/*
+
+set tabstop=2
+set shiftwidth=2
+set softtabstop=2
+set expandtab
+set nosmarttab
+
+set textwidth=100
+
+autocmd BufRead *.rs :setlocal tags=./rusty-tags.vi;/
+autocmd BufWritePost *.rs :silent! exec "!rusty-tags vi -o --quiet --start-dir=" . expand('%:p:h') . "&" | redraw!
+
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,8 @@
 Tantivy 0.5.2
-==========================
-
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
+===========================
+- bugfix #274
+- bugfix #280
+- bugfix #289

 Tantivy 0.5.1
 ==========================
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,8 @@
 [package]
 name = "tantivy"
-version = "0.5.1"
+version = "0.5.2"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
+build = "build.rs"
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Tantivy is a search engine library."""
@@ -16,17 +17,21 @@ byteorder = "1.0"
 lazy_static = "0.2.1"
 tinysegmenter = "0.1.0"
 regex = "0.2"
-fst = {version="0.2", default-features=false}
-atomicwrites = {version="0.1", optional=true}
+fst = "0.2"
+atomicwrites = "0.1.3"
+tempfile = "2.1"
 log = "0.3.6"
 combine = "2.2"
 tempdir = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
+libc = { version = "0.2.20", optional=true }
 num_cpus = "1.2"
 itertools = "0.5.9"
+lz4 = "1.20"
 bit-set = "0.4.0"
+time = "0.1"
 uuid = { version = "0.6", features = ["v4", "serde"] }
 chan = "0.1"
 crossbeam = "0.3"
@@ -38,17 +43,17 @@ stable_deref_trait = "1.0.0"
 rust-stemmers = "0.1.0"
 downcast = { version="0.9", features = ["nightly"]}
 matches = "0.1"
-snap = "0.2"
-bitpacking = {path = "../bitpacking"}

 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"

 [dev-dependencies]
 rand = "0.3"
-tempfile = "2.1"
 env_logger = "0.4"

+[build-dependencies]
+cc = { version="1.0.0", optional=true }
+
 [profile.release]
 opt-level = 3
 debug = false
@@ -57,23 +62,10 @@ debug-assertions = false


 [features]
-default = ["mmap"]
+default = ["simdcompression"]
+simdcompression = ["libc", "cc"]
 streamdict = []
-mmap = ["fst/mmap", "atomicwrites"]


 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
-
-[[example]]
-name = "simple_search"
-required-features = ["mmap"]
-
-
-[[bin]]
-name = "convert_to_static"
-path = "./bin/convert_to_static.rs"
-
-[[bin]]
-name = "test_static_dir"
-path = "./bin/test_static_dir.rs"
--- a/README.md
+++ b/README.md
@@ -5,26 +5,25 @@
 [![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy)
+![beacon for google analytics](https://ga-beacon.appspot.com/UA-88834340-1/tantivy/README)

 **Tantivy** is a **full text search engine library** written in rust.

 It is strongly inspired by Lucene's design.

+
 # Features

- Tiny startup time (<10ms), perfect for command line tools
+- configurable indexing (optional term frequency and position indexing)
 - tf-idf scoring
 - Basic query language
 - Phrase queries
 - Incremental indexing
 - Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
+- mmap based
 - optional SIMD integer compression
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
+- u64 and i64 fast fields (equivalent of doc values in Lucene)
 - LZ4 compressed document store
- Range queries
- Faceting
- configurable indexing (optional term frequency and position indexing
 - Cheesy logo with a horse

 Tantivy supports Linux, MacOS and Windows.
@@ -41,38 +40,14 @@ It will walk you through getting a wikipedia search engine up and running in a f

 # Compiling

-## Development
-
-Tantivy requires Rust Nightly because it uses requires the features [`box_syntax`](https://doc.rust-lang.org/stable/unstable-book/language-features/box-syntax.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md),
-and [simd](https://github.com/rust-lang/rust/issues/27731).
-
-
-To check out and run test, you can simply run :
+Tantivy requires Rust Nightly because it uses the features [`box_syntax`](https://doc.rust-lang.org/stable/book/box-syntax-and-patterns.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), and [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md).
+The project can then be built using `cargo`.

    git clone git@github.com:tantivy-search/tantivy.git
    cd tantivy
-    cargo +nightly build
+    cargo build


-## Note on release build and performance
-
-If your project depends on `tantivy`, for better performance, make sure to enable
-`sse3` instructions using a RUSTFLAGS. (This instruction set is likely to
-be available on most `x86_64` CPUs you will encounter).
-
-For instance,
-
-    RUSTFLAGS='-C target-feature=+sse3'
-
-Or, if you are targetting a specific cpu
-
-    RUSTFLAGS='-C target-cpu=native' build --release
-
-Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
-If you want to disable those, you can run the following command :
-
-    cargo build --no-default-features
-
 Alternatively, if you are trying to compile `tantivy` without simd compression,
 you can disable this functionality. In this case, this submodule is not required
 and you can compile tantivy by using the `--no-default-features` flag.
@@ -82,4 +57,4 @@ and you can compile tantivy by using the `--no-default-features` flag.

 # Contribute

-Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
+Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
--- a/bin/convert_to_static.rs
+++ b/bin/convert_to_static.rs
@@ -1,20 +0,0 @@
-use std::env;
-use std::path::PathBuf;
-use std::fs::File;
-use std::io::Write;
-extern crate tantivy;
-use tantivy::directory::write_static_from_directory;
-
-fn main() {
-    // Prints each argument on a separate line
-    let  mut args = env::args();
-    args.next().unwrap();
-    let directory_path= args.next().expect("Expect 2 args.<directory_path> <outputfile>");
-    let output_path = args.next().expect("Expect 2 args.<directory_path> <outputfile>");
-    println!("{} => {}", directory_path, output_path);
-    let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
-    println!("Read all");
-    let mut output = File::create(output_path).unwrap();
-    output.write_all(&buffer[..]).unwrap();
-    output.flush().unwrap();
-}
--- a/bin/test_static_dir.rs
+++ b/bin/test_static_dir.rs
@@ -1,51 +0,0 @@
-use std::env;
-use std::path::PathBuf;
-use std::fs::File;
-use std::io::Write;
-extern crate tantivy;
-use tantivy::directory::{StaticDirectory, write_static_from_directory};
-use tantivy::Index;
-use tantivy::query::QueryParser;
-use tantivy::collector::TopCollector;
-
-
-static DATA: &'static [u8] = include_bytes!("output.bin");
-
-fn run() -> tantivy::Result<()> {
-    // Prints each argument on a separate line
-    let directory = StaticDirectory::open(DATA).unwrap();
-    let index = Index::open_directory(directory).unwrap();
-    index.load_searchers().unwrap();
-    let searcher = index.searcher();
-
-    let schema = index.schema();
-    let title = schema.get_field("title").unwrap();
-    let body = schema.get_field("body").unwrap();
-
-    let query_parser = QueryParser::for_index(&index, vec![title, body]);
-    let query = query_parser.parse_query("sea whale")?;
-
-    let mut top_collector = TopCollector::with_limit(10);
-
-    searcher.search(&*query, &mut top_collector)?;
-
-    let doc_addresses = top_collector.docs();
-
-    // The actual documents still need to be
-    // retrieved from Tantivy's store.
-    //
-    // Since the body field was not configured as stored,
-    // the document returned will only contain
-    // a title.
-
-    for doc_address in doc_addresses {
-        let retrieved_doc = searcher.doc(&doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
-    }
-    Ok(())
-}
-
-
-fn main() {
-    run().unwrap();
-}
--- a/build.rs
+++ b/build.rs
@@ -0,0 +1,61 @@
+#[cfg(feature = "simdcompression")]
+mod build { // Real build step: compiles the bundled C sources under cpp/ with the `cc` crate.
+    extern crate cc;
+
+    pub fn build() { // Builds one static library from the simdcomp (and, off MSVC, streamvbyte) C files.
+        let mut config = cc::Build::new();
+        config
+            .include("./cpp/simdcomp/include")
+            .file("cpp/simdcomp/src/avxbitpacking.c")
+            .file("cpp/simdcomp/src/simdintegratedbitpacking.c")
+            .file("cpp/simdcomp/src/simdbitpacking.c")
+            .file("cpp/simdcomp/src/simdpackedsearch.c")
+            .file("cpp/simdcomp/src/simdcomputil.c")
+            .file("cpp/simdcomp/src/simdpackedselect.c")
+            .file("cpp/simdcomp/src/simdfor.c")
+            .file("cpp/simdcomp_wrapper.c");
+
+        if !cfg!(debug_assertions) { // NOTE(review): cfg! is resolved when build.rs itself is compiled — confirm it tracks the crate's profile.
+            config.opt_level(3);
+
+            if cfg!(target_env = "msvc") { // NOTE(review): presumably describes the build script's own target — verify when cross-compiling.
+                config
+                    .define("NDEBUG", None)
+                    .flag("/Gm-")
+                    .flag("/GS-")
+                    .flag("/Gy")
+                    .flag("/Oi")
+                    .flag("/GL");
+            }
+        }
+
+        if !cfg!(target_env = "msvc") { // streamvbyte sources and the GCC-style flags are only added off MSVC.
+            config
+                .include("./cpp/streamvbyte/include")
+                .file("cpp/streamvbyte/src/streamvbyte.c")
+                .file("cpp/streamvbyte/src/streamvbytedelta.c")
+                .flag("-msse4.1")
+                .flag("-march=native") // NOTE(review): looks like this ties the object code to the build machine's CPU — confirm intended.
+                .flag("-std=c99");
+        }
+
+        config.compile("libsimdcomp.a");
+
+        // Workaround for linking static libraries built with /GL
+        // https://github.com/rust-lang/rust/issues/26003
+        if !cfg!(debug_assertions) && cfg!(target_env = "msvc") {
+            println!("cargo:rustc-link-lib=dylib=simdcomp");
+        }
+
+        println!("cargo:rerun-if-changed=cpp"); // Ask cargo to re-run this script when the cpp path changes.
+    }
+}
+
+#[cfg(not(feature = "simdcompression"))]
+mod build { // Feature disabled: no C code is compiled.
+    pub fn build() {}
+}
+
+fn main() {
+    build::build(); // Dispatches to whichever `build` module the feature flag selected.
+}
--- a/cpp/simdcomp/.gitignore
+++ b/cpp/simdcomp/.gitignore
@@ -0,0 +1,9 @@
+Makefile.in
+lib*
+unit*
+*.o
+src/*.lo
+src/*.o
+src/.deps
+src/.dirstamp
+src/.libs
--- a/cpp/simdcomp/.travis.yml
+++ b/cpp/simdcomp/.travis.yml
@@ -0,0 +1,11 @@
+language: c
+sudo: false
+compiler:
+  - gcc
+  - clang
+
+branches:
+  only:
+    - master
+
+script: make && ./unit
--- a/cpp/simdcomp/CHANGELOG
+++ b/cpp/simdcomp/CHANGELOG
@@ -0,0 +1,9 @@
+Upcoming
+  - added missing include
+  - improved portability (MSVC)
+  - implemented C89 compatibility
+Version 0.0.3 (19 May 2014)
+  - improved documentation
+Version 0.0.2 (6 February 2014)
+  - added go demo
+Version 0.0.1  (5 February 2014)
--- a/cpp/simdcomp/LICENSE
+++ b/cpp/simdcomp/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2014--, The authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+* Neither the name of the {organization} nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/cpp/simdcomp/README.md
+++ b/cpp/simdcomp/README.md
@@ -0,0 +1,137 @@
+The SIMDComp library
+====================
+[![Build Status](https://travis-ci.org/lemire/simdcomp.png)](https://travis-ci.org/lemire/simdcomp)
+
+A simple C library for compressing lists of integers using binary packing and SIMD instructions.
+The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers.
+
+This library can decode at least 4 billion compressed integers per second on most
+desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s.
+This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.
+
+On a Skylake Intel processor, it can decode integers at a rate of 0.3 cycles per integer,
+which can easily translate into more than 8 billion decoded integers per second.
+
+Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others
+
+What is it for?
+-------------
+
+This is a low-level library for fast integer compression. By design it does not define a compressed
+format. It is up to the (sophisticated) user to create a compressed format.
+
+Requirements
+-------------
+
+- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.)
+- It is possible to build the core part of the code if your processor supports SSE2 (Pentium4 or better)
+- C99 compliant compiler (GCC is assumed)
+- A Linux-like distribution is assumed by the makefile
+
+For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker
+
+Usage
+-------
+
+Compression works over blocks of 128 integers.
+
+For a complete working example, see example.c (you can build it and
+run it with "make example; ./example").
+
+
+
+1) Lists of integers in random order.
+
+```C            
+const uint32_t b = maxbits(datain);// computes bit width
+simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*16 bytes
+simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
+```
+
+While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b.
+
+2) Sorted lists of integers.
+
+We used differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset).
+
+```C            
+uint32_t offset = 0;
+uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
+simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*16 bytes
+simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
+```
+
+General example for arrays of arbitrary length:
+```C
+int compress_decompress_demo() {
+  size_t k, N = 9999;
+  __m128i * endofbuf;
+  uint32_t * datain = malloc(N * sizeof(uint32_t));
+  uint8_t * buffer;
+  uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
+  uint32_t b;
+
+  for (k = 0; k < N; ++k){        /* start with k=0, not k=1! */
+    datain[k] = k;
+  }
+
+  b = maxbits_length(datain, N);
+  buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory
+  endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
+  /* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */
+  /* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */
+  simdunpack_length((const __m128i *)buffer, N, backbuffer, b);
+
+  for (k = 0; k < N; ++k){
+    if(datain[k] != backbuffer[k]) {
+      printf("bug\n");
+      return -1;
+    }
+  }
+  return 0;
+}
+```
+
+
+3) Frame-of-Reference 
+
+We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing
+routines, but do not use differential coding so they allow faster search in some cases, at the expense
+of compression.
+
+Setup
+---------
+
+
+make
+make test
+
+and if you are daring:
+
+make install
+
+Go
+--------
+
+If you are a go user, there is a "go" folder where you will find a simple demo.
+
+Other libraries
+----------------
+
+* Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
+* Fast integer compression in C using StreamVByte https://github.com/lemire/streamvbyte
+* FastPFOR is a C++ research library well suited to compress unsorted arrays: https://github.com/lemire/FastPFor
+* SIMDCompressionAndIntersection is a C++ research library well suited for sorted arrays (differential coding)
+and computing intersections: https://github.com/lemire/SIMDCompressionAndIntersection
+* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
+* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
+
+
+References
+------------
+
+* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399
+* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015.  http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
+* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
+* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916
+* T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5
--- a/cpp/simdcomp/benchmarks/benchmark.c
+++ b/cpp/simdcomp/benchmarks/benchmark.c
@@ -0,0 +1,235 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "simdcomp.h"
+
+#ifdef _MSC_VER
+# include <windows.h>
+
+__int64 freq;
+
+typedef __int64 time_snap_t;
+
+static time_snap_t time_snap(void)
+{
+	__int64 now;
+
+	QueryPerformanceCounter((LARGE_INTEGER *)&now);
+
+	return (__int64)((now*1000000)/freq);
+}
+# define TIME_SNAP_FMT "%I64d"
+#else
+# define time_snap clock
+# define TIME_SNAP_FMT "%lu"
+typedef clock_t time_snap_t;
+#endif
+
+
+void benchmarkSelect() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t initial = 33;
+    uint32_t b;
+    time_snap_t S1, S2, S3;
+    int i;
+    printf("benchmarking select \n");
+
+    /* this test creates delta encoded buffers with different bits, then
+     * times simdselectd1 against a full simdunpackd1 for each position */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        uint32_t out[128];
+        /* initialize the buffer */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)(1655765 * i )) ;
+            if(b < 32) buffer[i] %= (1<<b);
+        }
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+
+        S1 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i % 128);
+            assert(valretrieved == buffer[i%128]);
+        }
+        S2 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+            assert(backbuffer[i % 128] == buffer[i % 128]);
+        }
+        S3 = time_snap();
+        printf("bit width = %d, fast select function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT "  \n", b, (S2-S1), (S3-S2));
+    }
+}
+
+int uint32_cmp(const void *a, const void *b) /* three-way comparison of two uint32_t values: -1, 0 or 1 */
+{
+    const uint32_t *ia = (const uint32_t *)a;
+    const uint32_t *ib = (const uint32_t *)b;
+    if(*ia < *ib)
+        return -1;
+    else if (*ia > *ib)
+        return 1;
+    return 0;
+}
+
+/* adapted from wikipedia: bisects A between imin and imax-1; returns the index where key is found, otherwise the final upper index */
+int binary_search(uint32_t * A, uint32_t key, int imin, int imax)
+{
+    int imid;
+    imax --;
+    while(imin + 1 < imax) {
+        imid = imin + ((imax - imin) / 2);
+
+        if (A[imid] > key) {
+            imax = imid;
+        } else if (A[imid] < key) {
+            imin = imid;
+        } else {
+            return imid;
+        }
+    }
+    return imax;
+}
+
+
+/* adapted from wikipedia: bisects A between imin and imax-1; returns imin if A[imin] >= key, otherwise the narrowed upper index */
+int lower_bound(uint32_t * A, uint32_t key, int imin, int imax)
+{
+    int imid;
+    imax --;
+    while(imin + 1 < imax) {
+        imid = imin + ((imax - imin) / 2);
+
+        if (A[imid] >= key) {
+            imax = imid;
+        } else if (A[imid] < key) {
+            imin = imid;
+        }
+    }
+    if(A[imin] >= key) return imin;
+    return imax;
+}
+
+void benchmarkSearch() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t out[128];
+    uint32_t result, initial = 0;
+    uint32_t b, i;
+    time_snap_t S1, S2, S3, S4;
+
+    printf("benchmarking search \n");
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        /* initialize the buffer */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)rand()) ;
+            if(b < 32) buffer[i] %= (1<<b);
+        }
+
+        qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
+
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+        simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+
+        for (i = 0; i < 128; i++) {
+            assert(buffer[i] == backbuffer[i]);
+         }
+        S1 = time_snap(); /* pass 1: simdsearchd1 on the packed block */
+        for (i = 0; i < 128 * 10; i++) {
+
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            __m128i vecinitial = _mm_set1_epi32(initial);
+            pos = simdsearchd1(&vecinitial, (__m128i *)out, b,
+                               pseudorandomkey, &result);
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug A.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug B.\n");
+            }
+        }
+        S2 = time_snap(); /* pass 2: full simdunpackd1 followed by lower_bound */
+        for (i = 0; i < 128 * 10; i++) {
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+            pos =  lower_bound(backbuffer, pseudorandomkey, 0, 128);
+            result = backbuffer[pos];
+
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug C.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug D.\n");
+            }
+        }
+        S3 = time_snap(); /* pass 3: simdsearchwithlengthd1 on the packed block */
+        for (i = 0; i < 128 * 10; i++) {
+
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128,
+                               pseudorandomkey, &result);
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug A.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug B.\n");
+            }
+        }
+        S4 = time_snap();
+
+        printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT "  \n", b, (S2-S1), (S3-S2), (S4-S3) );
+    }
+}
+
+
+int main() {
+#ifdef _MSC_VER
+    QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
+#endif
+    benchmarkSearch();
+    benchmarkSelect();
+    return 0;
+}
--- a/cpp/simdcomp/benchmarks/bitpackingbenchmark.c
+++ b/cpp/simdcomp/benchmarks/bitpackingbenchmark.c
@@ -0,0 +1,205 @@
+#include <stdio.h>
+
+#include "simdcomp.h"
+
+
+#define RDTSC_START(cycles)                                                   \
+    do {                                                                      \
+        register unsigned cyc_high, cyc_low;                                  \
+        __asm volatile(                                                       \
+            "cpuid\n\t"                                                       \
+            "rdtsc\n\t"                                                       \
+            "mov %%edx, %0\n\t"                                               \
+            "mov %%eax, %1\n\t"                                               \
+            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
+    } while (0)
+
+#define RDTSC_FINAL(cycles)                                                   \
+    do {                                                                      \
+        register unsigned cyc_high, cyc_low;                                  \
+        __asm volatile(                                                       \
+            "rdtscp\n\t"                                                      \
+            "mov %%edx, %0\n\t"                                               \
+            "mov %%eax, %1\n\t"                                               \
+            "cpuid\n\t"                                                       \
+            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
+    } while (0)
+
+
+
+
+uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) { /* malloc'ed array of `length` values, each rand() masked to its low `bit` bits */
+    uint32_t * answer = malloc(sizeof(uint32_t) * length); /* NOTE(review): allocation result is not checked */
+    uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1); /* 64-bit shift so that bit == 32 yields an all-ones mask */
+    uint32_t i;
+    for(i = 0; i < length; ++i) {
+        answer[i] = rand() & mask;
+    }
+    return answer;
+}
+
+uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) { /* malloc'ed array where each element adds a `bit`-bit masked rand() to the previous one */
+    uint32_t * answer = malloc(sizeof(uint32_t) * length); /* NOTE(review): allocation result is not checked */
+    uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1); /* 64-bit shift so that bit == 32 yields an all-ones mask */
+    uint32_t i;
+    answer[0] = rand() & mask; /* NOTE(review): writes element 0 unconditionally — assumes length >= 1 */
+    for(i = 1; i < length; ++i) {
+        answer[i] = answer[i-1] + (rand() & mask);
+    }
+    return answer;
+}
+
+
+void demo128() {
+    const uint32_t length = 128;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %d integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+
+        uint32_t * data = get_random_array_from_bit_width(length, bit);
+        __m128i * buffer = malloc(length * sizeof(uint32_t));
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        printf("%d\t",bit);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdpackwithoutmask(data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdunpack(buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+
+void demo128_d1() {
+    const uint32_t length = 128;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %d integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+
+        uint32_t * data = get_random_array_from_bit_width_d1(length, bit);
+        __m128i * buffer = malloc(length * sizeof(uint32_t));
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        printf("%d\t",bit);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdpackwithoutmaskd1(0,data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdunpackd1(0,buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+
+#ifdef __AVX2__
+void demo256() {
+    const uint32_t length = 256;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %d integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+
+        uint32_t * data = get_random_array_from_bit_width(length, bit);
+        __m256i * buffer = malloc(length * sizeof(uint32_t));
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        printf("%d\t",bit);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            avxpackwithoutmask(data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1;
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            avxunpack(buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+#endif /* avx 2 */
+
+
+int main() {
+    demo128();
+    demo128_d1();
+#ifdef __AVX2__
+    demo256();
+#endif
+    return 0;
+
+
+}
--- a/cpp/simdcomp/example.c
+++ b/cpp/simdcomp/example.c
@@ -0,0 +1,195 @@
+/* Type "make example" to build this example program. */
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+#include "simdcomp.h"
+
+/**
+We provide several different code examples.
+**/
+
+
+/* very simple test to illustrate a simple application:
+ * packs N consecutive integers with one common bit width, unpacks them
+ * again and checks that the round trip is lossless.
+ * Returns 0 on success, -1 on failure (allocation failure or mismatch).
+ * Every buffer is released on every exit path. */
+int compress_decompress_demo() {
+    size_t k, N = 9999;
+    __m128i * endofbuf;
+    int howmanybytes;
+    float compratio;
+    int status = -1; /* pessimistic default; set to 0 only after verification */
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint8_t * buffer = NULL;
+    uint8_t * shrunk;
+    uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
+    uint32_t b;
+    printf("== simple test\n");
+    if (datain == NULL || backbuffer == NULL) {
+        printf("memory allocation failed\n");
+        goto cleanup;
+    }
+
+    for (k = 0; k < N; ++k) {       /* start with k=0, not k=1! */
+        datain[k] = (uint32_t) k;
+    }
+
+    b = maxbits_length(datain, N);
+    buffer = malloc(simdpack_compressedbytes(N,b));
+    if (buffer == NULL) {
+        printf("memory allocation failed\n");
+        goto cleanup;
+    }
+    endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
+    /* endofbuf points to the end of the compressed data */
+    howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */
+    compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes;
+    /* Shrinking is optional. Keep the old pointer if realloc fails, so the
+     * original allocation is neither lost nor dereferenced through NULL. */
+    shrunk = realloc(buffer, howmanybytes);
+    if (shrunk != NULL) {
+        buffer = shrunk;
+    }
+    printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio);
+    /* in actual applications b must be stored and retrieved: caller is responsible for that. */
+    simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */
+
+    for (k = 0; k < N; ++k) {
+        if(datain[k] != backbuffer[k]) {
+            printf("bug at %lu \n",(unsigned long)k);
+            goto cleanup; /* status is still -1 */
+        }
+    }
+    printf("Code works!\n");
+    status = 0;
+
+cleanup:
+    /* free(NULL) is a no-op, so this is safe whichever allocation failed */
+    free(datain);
+    free(buffer);
+    free(backbuffer);
+    return status;
+}
+
+
+
+/* Differentially packs `length` integers from datain into buffer, one
+ * SIMDBlockSize-integer block at a time. Each block is written as one byte
+ * holding its bit width followed by that many 128-bit vectors.
+ * Returns the number of bytes written. Used below in simple_demo.
+ * A length that is not a multiple of SIMDBlockSize only triggers a warning;
+ * the trailing partial block is not written. */
+size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+    const size_t nblocks = length / SIMDBlockSize;
+    uint8_t * const start = buffer;
+    uint32_t previous = 0; /* last value of the preceding block (0 for the first) */
+    size_t blk;
+    if(length % SIMDBlockSize != 0) {
+        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
+    }
+    for(blk = 0; blk < nblocks; ++blk) {
+        uint32_t * in = datain + blk * SIMDBlockSize;
+        uint32_t width = simdmaxbitsd1(previous, in);
+        *buffer++ = width; /* one-byte header: bit width of this block */
+        simdpackwithoutmaskd1(previous, in, (__m128i *) buffer, width);
+        previous = in[SIMDBlockSize - 1];
+        buffer += width * sizeof(__m128i);
+    }
+    return buffer - start;
+}
+
+/* Another illustration: for several gap sizes, builds a non-decreasing
+ * array of N integers, compresses it block by block with differential
+ * coding (see compress above), then times block-wise decoding against a
+ * plain block-wise memcpy of the same N uncompressed integers. */
+void simple_demo() {
+    size_t REPEAT = 10, gap;
+    size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    size_t compsize;
+    clock_t start, end;
+    /* worst case per block: one header byte + 32 vectors of 16 bytes */
+    uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    printf("== simple demo\n");
+    if (datain == NULL || buffer == NULL || backbuffer == NULL) {
+        printf("memory allocation failed\n");
+        free(buffer);
+        free(datain);
+        free(backbuffer);
+        return;
+    }
+    for (gap = 1; gap <= 243; gap *= 3) {
+        size_t k, repeat;
+        uint32_t bogus = 0; /* accumulates decoded values so the work cannot be optimized away */
+        double numberofseconds;
+
+        printf("\n");
+        printf(" gap = %lu \n", (unsigned long) gap);
+        datain[0] = 0;
+        for (k = 1; k < N; ++k)
+            datain[k] = datain[k-1] + ( rand() % (gap + 1) );
+        compsize = compress(datain,N,buffer);
+        printf("compression ratio = %f \n",  (N * sizeof(uint32_t))/ (compsize * 1.0 ));
+        start = clock();
+        for(repeat = 0; repeat < REPEAT; ++repeat) {
+            uint8_t * decbuffer = buffer;
+            /* every pass re-reads the stream from its start, so the
+             * differential base must restart at 0 as it did when compressing */
+            uint32_t offset = 0;
+            for (k = 0; k * SIMDBlockSize < N; ++k) {
+                uint8_t b = *decbuffer++;
+                simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
+                /* do something here with backbuffer */
+                bogus += backbuffer[3];
+                decbuffer += b * sizeof(__m128i);
+                offset = backbuffer[SIMDBlockSize - 1];
+            }
+        }
+        end = clock();
+        numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+        printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+        start = clock();
+        for(repeat = 0; repeat < REPEAT; ++repeat) {
+            for (k = 0; k * SIMDBlockSize < N; ++k) {
+                /* baseline: copy the k-th block of uncompressed integers, so
+                 * one pass moves exactly the N integers the rate below reports */
+                memcpy(backbuffer,datain+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t));
+                bogus += backbuffer[3] - backbuffer[100];
+            }
+        }
+        end = clock();
+        numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+        printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+        printf("ignore me %i \n",bogus);
+        printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
+    }
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+}
+
+/* Packs `length` integers from datain into buffer in blocks of
+ * SIMDBlockSize, choosing the bit width of each block independently.
+ * Each block is stored as a one-byte bit width followed by that many
+ * 128-bit vectors. Returns the number of bytes written.
+ * Used below in varying_bit_width_demo. A length that is not a multiple of
+ * SIMDBlockSize only triggers a warning; the partial tail is not written. */
+size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+    uint8_t * const start = buffer;
+    size_t remaining = length / SIMDBlockSize; /* whole blocks still to pack */
+    if(length % SIMDBlockSize != 0) {
+        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
+    }
+    while(remaining > 0) {
+        uint32_t width = maxbits(datain);
+        *buffer++ = width; /* one-byte header: bit width of this block */
+        simdpackwithoutmask(datain, (__m128i *)buffer, width);
+        buffer += width * sizeof(__m128i);
+        datain += SIMDBlockSize;
+        --remaining;
+    }
+    return buffer - start;
+}
+
+/* Here we compress the data in blocks of 128 integers with varying bit width,
+ * decode it again and verify the round trip.
+ * Returns 0 on success, -1 on failure (allocation failure or mismatch).
+ * All buffers are released on every exit path. */
+int varying_bit_width_demo() {
+    size_t nn = 128 * 2;
+    uint32_t * datainn = malloc(nn * sizeof(uint32_t));
+    /* worst case per block: one header byte + 32 vectors of 16 bytes */
+    uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
+    uint8_t * initbuffern = buffern; /* buffern is advanced while decoding; keep the base for free() */
+    uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
+    size_t k, compsize;
+    int status = 0;
+    printf("== varying bit-width demo\n");
+    if (datainn == NULL || buffern == NULL || backbuffern == NULL) {
+        printf("memory allocation failed\n");
+        free(datainn);
+        free(initbuffern);
+        free(backbuffern);
+        return -1;
+    }
+
+    for(k=0; k<nn; ++k) {
+        datainn[k] = rand() % (k + 1);
+    }
+
+    compsize = varying_bit_width_compress(datainn,nn,buffern);
+    printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
+           (unsigned)(nn * sizeof(uint32_t)));
+
+    for (k = 0; k * SIMDBlockSize < nn; ++k) {
+        uint32_t b = *buffern; /* per-block header: bit width */
+        buffern++;
+        simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
+        buffern += b * sizeof(__m128i);
+    }
+
+    for (k = 0; k < nn; ++k) {
+        if(backbuffern[k] != datainn[k]) {
+            printf("bug\n");
+            status = -1;
+            break; /* fall through to the common cleanup instead of leaking */
+        }
+    }
+    if (status == 0) {
+        printf("Code works!\n");
+    }
+    free(datainn);
+    free(initbuffern);
+    free(backbuffern);
+    return status;
+}
+
+/* Runs the two self-checking demos in order and stops at the first failure
+ * (exit status -1); the timing demo only runs once both have passed. */
+int main() {
+    if(compress_decompress_demo() == 0 && varying_bit_width_demo() == 0) {
+        simple_demo();
+        return 0;
+    }
+    return -1;
+}
--- a/cpp/simdcomp/go/README.md
+++ b/cpp/simdcomp/go/README.md
@@ -0,0 +1,13 @@
+Simple Go demo
+==============
+
+Setup
+======
+
+Start by installing the simdcomp library (make && make install).
+
+Then type:
+
+go run test.go
+
+
--- a/cpp/simdcomp/go/test.go
+++ b/cpp/simdcomp/go/test.go
@@ -0,0 +1,71 @@
+/////////
+// This particular file is in the public domain.
+// Author: Daniel Lemire
+////////
+
+package main 
+
+/*
+#cgo LDFLAGS: -lsimdcomp
+#include <simdcomp.h>
+*/
+import "C"
+import "fmt"
+
+//////////
+// For this demo, we pack and unpack blocks of 128 integers
+/////////
+func main() {
+        // The demo works directly on C types (the alternative would be
+        // unsafe.Pointer conversions, see http://bit.ly/1ndw3W3).
+        // Original data: the integers 0..127.
+        var data [128]C.uint32_t
+        for i := range data {
+            data[i] = C.uint32_t(i)
+        }
+
+        // ---- Round trip 1: plain bit packing, no differential coding ----
+        // bit width needed to represent every value of the block
+        b := C.maxbits(&data[0])
+        fmt.Println("Bit width  ", b)
+        fmt.Println(fmt.Sprintf("Compression ratio %f ", 32.0/float64(b)))
+        // output buffer: one __m128i (128 bits) per bit of width
+        out := make([]C.__m128i, b)
+        C.simdpackwithoutmask(&data[0], &out[0], b)
+        var recovereddata [128]C.uint32_t
+        C.simdunpack(&out[0], &recovereddata[0], b)
+        // arrays compare element-wise, so this checks all 128 values
+        if data != recovereddata {
+            fmt.Println("Bug ")
+            return
+        }
+
+        // ---- Round trip 2: differential coding ----
+        // When packing data from K to K + 128, offset should be the value at
+        // K-1. When K = 0, choose a default.
+        offset := C.uint32_t(0)
+        b1 := C.simdmaxbitsd1(offset, &data[0])
+        fmt.Println("Bit width  ", b1)
+        fmt.Println(fmt.Sprintf("Compression ratio %f ", 32.0/float64(b1)))
+        // fresh output buffer sized for the (smaller) differential bit width
+        out = make([]C.__m128i, b1)
+        C.simdpackwithoutmaskd1(offset, &data[0], &out[0], b1)
+        C.simdunpackd1(offset, &out[0], &recovereddata[0], b1)
+        if data != recovereddata {
+            fmt.Println("Bug ")
+            return
+        }
+
+        fmt.Println("test succesful.")
+}
--- a/cpp/simdcomp/include/avxbitpacking.h
+++ b/cpp/simdcomp/include/avxbitpacking.h
@@ -0,0 +1,40 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef INCLUDE_AVXBITPACKING_H_
+#define INCLUDE_AVXBITPACKING_H_
+
+
+#ifdef __AVX2__
+
+#include "portability.h"
+
+
+/* AVX2 is required */
+#include <immintrin.h>
+/* for memset */
+#include <string.h>
+
+#include "simdcomputil.h"
+
+enum{ AVXBlockSize = 256};
+
+/* max integer logarithm over a range of AVXBlockSize integers (256 integers) */
+uint32_t avxmaxbits(const uint32_t * begin);
+
+/* reads 256 values from "in", writes  "bit" 256-bit vectors to "out" */
+void avxpack(const uint32_t *  in,__m256i *  out, const uint32_t bit);
+
+/* reads 256 values from "in", writes  "bit" 256-bit vectors to "out" */
+void avxpackwithoutmask(const uint32_t *  in,__m256i *  out, const uint32_t bit);
+
+/* reads  "bit" 256-bit vectors from "in", writes  256 values to "out" */
+void avxunpack(const __m256i *  in,uint32_t *  out, const uint32_t bit);
+
+
+
+
+#endif /* __AVX2__ */
+
+#endif /* INCLUDE_AVXBITPACKING_H_ */
--- a/cpp/simdcomp/include/portability.h
+++ b/cpp/simdcomp/include/portability.h
@@ -0,0 +1,81 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITCOMPAT_H_
+#define SIMDBITCOMPAT_H_
+
+#include <iso646.h> /* mostly for Microsoft compilers */
+#include <string.h>
+
+#if SIMDCOMP_DEBUG
+# define SIMDCOMP_ALWAYS_INLINE inline
+# define SIMDCOMP_NEVER_INLINE
+# define SIMDCOMP_PURE
+#else
+# if defined(__GNUC__)
+#  if __GNUC__ >= 3
+#   define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
+#   define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
+#   define SIMDCOMP_PURE __attribute__((pure))
+#  else
+#   define SIMDCOMP_ALWAYS_INLINE inline
+#   define SIMDCOMP_NEVER_INLINE
+#   define SIMDCOMP_PURE
+#  endif
+# elif defined(_MSC_VER)
+#  define SIMDCOMP_ALWAYS_INLINE __forceinline
+#  define SIMDCOMP_NEVER_INLINE
+#  define SIMDCOMP_PURE
+# else
+#  if __has_attribute(always_inline)
+#   define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
+#  else
+#   define SIMDCOMP_ALWAYS_INLINE inline
+#  endif
+#  if __has_attribute(noinline)
+#   define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
+#  else
+#   define SIMDCOMP_NEVER_INLINE
+#  endif
+#  if __has_attribute(pure)
+#   define SIMDCOMP_PURE __attribute__((pure))
+#  else
+#   define SIMDCOMP_PURE
+#  endif
+# endif
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER < 1600
+typedef unsigned int uint32_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+#else
+#include <stdint.h> /* part of Visual Studio 2010 and better, others likely anyway */
+#endif
+
+#if defined(_MSC_VER)
+#define SIMDCOMP_ALIGNED(x) __declspec(align(x))
+#else
+#if defined(__GNUC__)
+#define SIMDCOMP_ALIGNED(x) __attribute__ ((aligned(x)))
+#endif
+#endif
+
+#if defined(_MSC_VER)
+# include <intrin.h>
+/* 64-bit needs extending */
+# define SIMDCOMP_CTZ(result, mask) do { \
+		unsigned long index; \
+		if (!_BitScanForward(&(index), (mask))) { \
+			(result) = 32U; \
+		} else { \
+			(result) = (uint32_t)(index); \
+		} \
+	} while (0)
+#else
+# define SIMDCOMP_CTZ(result, mask) \
+	result = __builtin_ctz(mask)
+#endif
+
+#endif /* SIMDBITCOMPAT_H_ */
+
--- a/cpp/simdcomp/include/simdbitpacking.h
+++ b/cpp/simdcomp/include/simdbitpacking.h
@@ -0,0 +1,72 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITPACKING_H_
+#define SIMDBITPACKING_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+/* for memset */
+#include <string.h>
+
+#include "simdcomputil.h"
+
+/***
+* Please see example.c for various examples on how to make good use
+* of these functions.
+*/
+
+
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
+ * The input values are masked so that only the least significant "bit" bits are used. */
+void simdpack(const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
+ * The input values are assumed to be less than 1<<bit. */
+void simdpackwithoutmask(const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+/* reads  "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpack(const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+
+/* how many compressed bytes are needed to compress length integers using a bit width of bit with
+the simdpack_length function. */
+int simdpack_compressedbytes(int length, const uint32_t bit);
+
+/* like simdpack, but supports an undetermined number of inputs.
+ * This is useful if you need to pack an array of integers whose length is not a multiple of 128.
+ * Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location between
+ the provided (out) pointer and the returned pointer. */
+__m128i * simdpack_length(const uint32_t *   in, size_t length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpack, but supports an undetermined number of inputs.
+ * This is useful if you need to unpack an array of integers that is not divisible by 128 integers.
+ * Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided 
+ (in) pointer and the returned pointer. */
+const __m128i * simdunpack_length(const __m128i *   in, size_t length, uint32_t * out, const uint32_t bit);
+
+
+
+
+/* like simdpack, but supports an undetermined small number of inputs. This is useful if you need to pack less 
+than 128 integers.
+ * Note that this function is much slower.
+ * Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location 
+ between the provided (out) pointer and the returned pointer. */
+__m128i * simdpack_shortlength(const uint32_t *   in, int length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpack, but supports an undetermined small number of inputs. This is useful if you need to unpack less
+ than 128 integers.
+ * Note that this function is much slower.
+ * Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided (in) 
+ pointer and the returned pointer. */
+const __m128i * simdunpack_shortlength(const __m128i *   in, int length, uint32_t * out, const uint32_t bit);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
+void simdfastset(__m128i * in128, uint32_t b, uint32_t value, size_t index);
+
+#endif /* SIMDBITPACKING_H_ */
--- a/cpp/simdcomp/include/simdcomp.h
+++ b/cpp/simdcomp/include/simdcomp.h
@@ -0,0 +1,22 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMP_H_
+#define SIMDCOMP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "simdbitpacking.h"
+#include "simdcomputil.h"
+#include "simdfor.h"
+#include "simdintegratedbitpacking.h"
+#include "avxbitpacking.h"
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif 
--- a/cpp/simdcomp/include/simdcomputil.h
+++ b/cpp/simdcomp/include/simdcomputil.h
@@ -0,0 +1,54 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMPUTIL_H_
+#define SIMDCOMPUTIL_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+
+
+
+/* returns the integer logarithm of v (bit width) */
+uint32_t bits(const uint32_t v);
+
+/* max integer logarithm over a range of SIMDBlockSize integers (128 integers) */
+uint32_t maxbits(const uint32_t * begin);
+
+/* same as maxbits, but we specify the number of integers */
+uint32_t maxbits_length(const uint32_t * in,uint32_t length);
+
+enum{ SIMDBlockSize = 128};
+
+
+/* computes (quickly) the minimal value of 128 values */
+uint32_t simdmin(const uint32_t * in);
+
+/* computes (quickly) the minimal value of the specified number of values */
+uint32_t simdmin_length(const uint32_t * in, uint32_t length);
+
+#ifdef __SSE4_1__
+/* computes (quickly) the minimal and maximal value of the specified number of values */
+void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax);
+
+/* computes (quickly) the minimal and maximal value of the 128 values */
+void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax);
+
+#endif
+
+/* like maxbits over 128 integers (SIMDBlockSize) with provided initial value
+   and using differential coding */
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in);
+
+/* like simdmaxbitsd1, but calculates maxbits over |length| integers 
+   with provided initial value. |length| can be any arbitrary value. */
+uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
+                uint32_t length);
+
+
+
+#endif /* SIMDCOMPUTIL_H_ */
--- a/cpp/simdcomp/include/simdfor.h
+++ b/cpp/simdcomp/include/simdfor.h
@@ -0,0 +1,72 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef INCLUDE_SIMDFOR_H_
+#define INCLUDE_SIMDFOR_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+#include "simdcomputil.h"
+#include "simdbitpacking.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out" */
+void simdpackFOR(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpackFOR(uint32_t initvalue, const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+/* how many compressed bytes are needed to compress length integers using a bit width of bit with
+the  simdpackFOR_length function. */
+int simdpackFOR_compressedbytes(int length, const uint32_t bit);
+
+/* like simdpackFOR, but supports an undetermined number of inputs. 
+This is useful if you need to pack less than 128 integers. Note that this function is much slower. 
+ Compressed data is stored in the memory location between 
+ the provided (out) pointer and the returned pointer. */
+__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t *   in, int length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpackFOR, but supports an undetermined number of inputs. 
+This is useful if you need to unpack less than 128 integers. Note that this function is much slower. 
+ The read compressed data is between the provided 
+ (in) pointer and the returned pointer.  */
+const __m128i * simdunpackFOR_length(uint32_t initvalue, const __m128i *   in, int length, uint32_t * out, const uint32_t bit);
+
+
+/* returns the value stored at the specified "slot".
+* */
+uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int slot);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
+void simdfastsetFOR(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
+
+
+/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The encoded key is stored in "*presult".
+ * The first length decoded integers, ignoring others. If no value is larger or equal to the key,
+ * length is returned. Length should be no larger than 128.
+ *
+ * If no value is larger or equal to the key,
+* length is returned */
+int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int length, uint32_t key, uint32_t *presult);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+
+
+
+#endif /* INCLUDE_SIMDFOR_H_ */
--- a/cpp/simdcomp/include/simdintegratedbitpacking.h
+++ b/cpp/simdcomp/include/simdintegratedbitpacking.h
@@ -0,0 +1,98 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMD_INTEGRATED_BITPACKING_H
+#define SIMD_INTEGRATED_BITPACKING_H
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+#include "simdcomputil.h"
+#include "simdbitpacking.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
+   integer values should be in sorted order (for best results).
+   The differences are masked so that only the least significant "bit" bits are used. */
+void simdpackd1(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
+   integer values should be in sorted order (for best results).
+   The difference values are assumed to be less than 1<<bit. */
+void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpackd1(uint32_t initvalue, const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+/* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The encoded key is stored in "*presult". If no value is larger or equal to the key,
+* 128 is returned. The pointer initOffset is a pointer to the last four values decoded
+* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init)),
+* and the vector gets updated.
+**/
+int
+simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit,
+                uint32_t key, uint32_t *presult);
+
+
+/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The encoded key is stored in "*presult".
+ * The first length decoded integers, ignoring others. If no value is larger or equal to the key,
+ * length is returned. Length should be no larger than 128.
+ *
+ * If no value is larger or equal to the key,
+* length is returned */
+int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int length, uint32_t key, uint32_t *presult);
+
+
+
+/* returns the value stored at the specified "slot".
+* */
+uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int slot);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value",
+ * you must somehow know the previous value.
+ * Because of differential coding, all following values are incremented by the offset between this new
+ * value and the old value... 
+ * This function is useful if you want to modify the last value.
+ */
+void simdfastsetd1fromprevious( __m128i * in, uint32_t bit, uint32_t previousvalue, uint32_t value, size_t index);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value",
+ * This function computes the previous value if needed.
+ * Because of differential coding, all following values are incremented by the offset between this new
+ * value and the old value...
+ * This function is useful if you want to modify the last value.
+ */
+void simdfastsetd1(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
+
+
+/*Simply scan the data
+* The pointer initOffset is a pointer to the last four values decoded
+* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init);),
+* and the vector gets updated.
+* */
+
+void
+simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
--- a/cpp/simdcomp/makefile
+++ b/cpp/simdcomp/makefile
@@ -0,0 +1,79 @@
+# minimalist makefile
+.SUFFIXES:
+#
+.SUFFIXES: .cpp .o .c .h
+ifeq ($(DEBUG),1)
+CFLAGS = -fPIC  -std=c89 -ggdb -msse4.1 -march=native -Wall -Wextra -Wshadow -fsanitize=undefined  -fno-omit-frame-pointer -fsanitize=address
+else
+CFLAGS = -fPIC -std=c89 -O3 -msse4.1  -march=native -Wall -Wextra -Wshadow
+endif # debug
+LDFLAGS = -shared
+LIBNAME=libsimdcomp.so.0.0.3
+all:  unit unit_chars bitpackingbenchmark $(LIBNAME)
+test:
+	./unit
+	./unit_chars
+# Install the shared library and the public headers under /usr/local.
+# The prerequisite is $(LIBNAME), which is already assigned above. Prerequisite
+# lists are expanded when the rule is read, and OBJECTS is only assigned
+# further down, so "$(OBJECTS)" would expand to nothing here and the
+# library would never be built before being copied.
+install: $(LIBNAME)
+	cp $(LIBNAME) /usr/local/lib
+	# -f: replace a symlink left behind by a previous install instead of failing
+	ln -sf /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
+	ldconfig
+	cp $(HEADERS) /usr/local/include
+
+
+
+HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h
+
+uninstall:
+	for h in $(HEADERS) ; do rm  /usr/local/$$h; done
+	rm  /usr/local/lib/$(LIBNAME)
+	rm /usr/local/lib/libsimdcomp.so
+	ldconfig
+
+
+OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \
+		 simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o
+
+$(LIBNAME): $(OBJECTS)
+	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS)  $(LDFLAGS)
+
+
+avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude
+
+
+simdfor.o: ./src/simdfor.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude
+
+
+simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
+
+simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
+
+simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c  $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
+
+simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude
+
+simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude
+
+example: ./example.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o example ./example.c -Iinclude  $(OBJECTS)
+
+unit: ./tests/unit.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude  $(OBJECTS)
+
+bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude  $(OBJECTS)
+benchmark: ./benchmarks/benchmark.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude  $(OBJECTS)
+dynunit: ./tests/unit.c    $(HEADERS) $(LIBNAME)
+	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude  -lsimdcomp
+
+unit_chars: ./tests/unit_chars.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude  $(OBJECTS)
+clean:
+	rm -f unit *.o $(LIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars
--- a/cpp/simdcomp/makefile.vc
+++ b/cpp/simdcomp/makefile.vc
@@ -0,0 +1,104 @@
+
+!IFNDEF MACHINE
+!IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64"
+MACHINE=x64
+!ELSE
+MACHINE=x86
+!ENDIF
+!ENDIF
+
+!IFNDEF DEBUG
+DEBUG=no
+!ENDIF
+
+!IFNDEF CC
+CC=cl.exe
+!ENDIF
+
+!IFNDEF AR
+AR=lib.exe
+!ENDIF
+
+!IFNDEF LINK
+LINK=link.exe
+!ENDIF
+
+!IFNDEF PGO
+PGO=no
+!ENDIF
+
+!IFNDEF PGI
+PGI=no
+!ENDIF
+
+INC = /Iinclude
+
+!IF "$(DEBUG)"=="yes"
+CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm
+ARFLAGS = /nologo
+LDFLAGS = /nologo /debug /nodefaultlib:msvcrt
+!ELSE
+CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP
+ARFLAGS = /nologo /LTCG
+LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf
+!ENDIF
+
+!IF "$(PGI)"=="yes"
+LDFLAGS = $(LDFLAGS) /ltcg:pgi
+!ENDIF
+
+!IF "$(PGO)"=="yes"
+LDFLAGS = $(LDFLAGS) /ltcg:pgo
+!ENDIF
+
+LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \
+	simdpackedsearch.obj simdpackedselect.obj simdfor.obj
+
+
+all: lib dll dynunit unit_chars example benchmark
+# need some good use case scenario to train the instrumented build
+	@if "$(PGI)"=="yes" echo Running PGO training
+	@if "$(PGI)"=="yes" benchmark.exe >nul 2>&1
+	@if "$(PGI)"=="yes" example.exe >nul 2>&1
+
+
+$(LIB_OBJS):
+	$(CC) $(INC) $(CFLAGS) /c src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \
+		src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c
+
+lib: $(LIB_OBJS)
+	$(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS)
+
+dll: $(LIB_OBJS)
+	$(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS)
+
+# Test, example and benchmark programs.
+# Source locations follow the repository layout used by the unix makefile
+# (tests/unit.c, tests/unit_chars.c, benchmarks/benchmark.c, ./example.c).
+# Targets that depend on "lib" link the static archive it produces
+# (simdcomp_a.lib); only "dynunit", which depends on "dll", links the DLL
+# import library (simdcomp.lib).
+unit: lib
+	$(CC) $(INC) $(CFLAGS) /c tests/unit.c
+	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib
+
+dynunit: dll
+	$(CC) $(INC) $(CFLAGS) /c tests/unit.c
+	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib
+
+unit_chars: lib
+	$(CC) $(INC) $(CFLAGS) /c tests/unit_chars.c
+	$(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp_a.lib
+
+
+example: lib
+	$(CC) $(INC) $(CFLAGS) /c example.c
+	$(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp_a.lib
+
+benchmark: lib
+	$(CC) $(INC) $(CFLAGS) /c benchmarks/benchmark.c
+	$(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp_a.lib
+
+clean:
+	del /Q *.obj
+	del /Q *.lib
+	del /Q *.exe
+	del /Q *.dll
+	del /Q *.pgc
+	del /Q *.pgd
+	del /Q *.pdb
+
--- a/cpp/simdcomp/package.json
+++ b/cpp/simdcomp/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "simdcomp",
+  "version": "0.0.3",
+  "repo": "lemire/simdcomp",
+  "description": "A simple C library for compressing lists of integers",
+  "license": "BSD-3-Clause",
+  "src": [
+    "src/simdbitpacking.c",
+    "src/simdcomputil.c",
+    "src/simdintegratedbitpacking.c",
+    "include/simdbitpacking.h",
+    "include/simdcomp.h",
+    "include/simdcomputil.h",
+    "include/simdintegratedbitpacking.h"
+  ]
+}
--- a/cpp/simdcomp/scripts/avxpacking.py
+++ b/cpp/simdcomp/scripts/avxpacking.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+import sys
+def howmany(bit):
+    """ how many values are we going to pack? """
+    return 256
+
+def howmanywords(bit):
+    return (howmany(bit) * bit + 255)/256
+
+def howmanybytes(bit):
+    return howmanywords(bit) * 16
+
+# Opening banner of the generated C code, followed by the two function-pointer
+# typedefs that the dispatch tables printed at the end of this script rely on.
+print("""
+/** code generated by avxpacking.py starts here **/
+""")
+
+print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""")
+print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""")
+
+
+
+
+
+
+def plurial(number):
+    if(number <> 1):
+        return "s"
+    else :
+        return ""
+
+print("")
+print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {");
+print("  (void)compressed;");
+print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
+print("}");
+print("")
+
+for bit in range(1,33):
+    print("")
+    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
+    print("  const __m256i * in = (const __m256i *)  pin;");
+    print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    if( (bit & (bit-1)) <> 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
+    oldword = 0
+    for j in range(howmany(bit)/8):
+      firstword = j * bit / 32
+      if(firstword > oldword):
+        print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
+        oldword = firstword
+      secondword = (j * bit + bit - 1)/32
+      firstshift = (j*bit) % 32
+      if( firstword == secondword):
+          if(firstshift == 0):
+            print("  w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j))
+          else:
+            print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
+      else:
+          print("  tmp = _mm256_lddqu_si256 (in + {0});".format(j))
+          print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
+          secondshift = 32-firstshift
+          print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
+    print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
+    print("}");
+    print("")
+
+
+print("")
+print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {");
+print("  (void)compressed;");
+print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
+print("}");
+print("")
+
+for bit in range(1,33):
+    print("")
+    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
+    print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    print("  const __m256i * in = (const __m256i *) pin;");
+    if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
+    def maskfnc(x):
+        if(bit == 32): return x
+        return " _mm256_and_si256 ( mask, {0}) ".format(x)
+    if( (bit & (bit-1)) <> 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
+    oldword = 0
+    for j in range(howmany(bit)/8):
+      firstword = j * bit / 32
+      if(firstword > oldword):
+        print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
+        oldword = firstword
+      secondword = (j * bit + bit - 1)/32
+      firstshift = (j*bit) % 32
+      loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
+      if( firstword == secondword):
+          if(firstshift == 0):
+            print("  w{0} = {1};".format(firstword%2,loadstr))
+          else:
+            print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
+      else:
+          print("  tmp = {0};".format(loadstr))
+          print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
+          secondshift = 32-firstshift
+          print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
+    print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
+    print("}");
+    print("")
+
+
+print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {");
+print("  (void) compressed;");
+print("  memset(pout,0,{0});".format(howmany(0)));
+print("}");
+print("")
+
+for bit in range(1,33):
+    print("")
+    print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit));
+    print("  /* we are going to access  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    print("  __m256i * out = (__m256i *) pout;");
+    if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
+    maskstr = " _mm256_and_si256 ( mask, {0}) "
+    if (bit == 32) : maskstr = " {0} " # no need
+    oldword = 0
+    print("  w0 = _mm256_lddqu_si256 (compressed);")
+    for j in range(howmany(bit)/8):
+      firstword = j * bit / 32
+      secondword = (j * bit + bit - 1)/32
+      if(secondword > oldword):
+        print("  w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
+        oldword = secondword
+      firstshift = (j*bit) % 32
+      firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
+      if(firstshift == 0):
+          firstshiftstr =" w{0} " # no need
+      wfirst = firstshiftstr.format(firstword%2)
+      if( firstword == secondword):
+          if(firstshift + bit <> 32):
+            wfirst  = maskstr.format(wfirst)
+          print("  _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
+      else:
+          secondshift = (32-firstshift)
+          wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
+          wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
+          wfirstorsecond = maskstr.format(wfirstorsecond)
+          print("  _mm256_storeu_si256(out + {0},\n    {1});".format(j,wfirstorsecond))
+    print("}");
+    print("")
+
+
+print("static avxpackblockfnc avxfuncPackArr[] = {")
+for bit in range(0,32):
+  print("&avxpackblock{0},".format(bit))
+print("&avxpackblock32")
+print("};")
+
+print("static avxpackblockfnc avxfuncPackMaskArr[] = {")
+for bit in range(0,32):
+  print("&avxpackblockmask{0},".format(bit))
+print("&avxpackblockmask32")
+print("};")
+
+
+print("static avxunpackblockfnc avxfuncUnpackArr[] = {")
+for bit in range(0,32):
+  print("&avxunpackblock{0},".format(bit))
+print("&avxunpackblock32")
+print("};")
+print("/** code generated by avxpacking.py ends here **/")
--- a/cpp/simdcomp/scripts/simdfor.py
+++ b/cpp/simdcomp/scripts/simdfor.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+
+from math import ceil
+
+print("""
+/**
+* Blablabla
+*
+*/
+
+""");
+
+def mask(bit):
+  return str((1 << bit) - 1)
+
+for length in [32]:
+  print("""
+static __m128i  iunpackFOR0(__m128i initOffset, const __m128i *   _in , uint32_t *    _out) {
+    __m128i       *out = (__m128i*)(_out);
+    int i;
+    (void) _in;
+    for (i = 0; i < 8; ++i) {
+        _mm_store_si128(out++, initOffset);
+    	_mm_store_si128(out++, initOffset);
+        _mm_store_si128(out++, initOffset);
+        _mm_store_si128(out++, initOffset);
+    }
+
+    return initOffset;
+}
+
+  """)
+  print("""
+
+static void ipackFOR0(__m128i initOffset , const uint32_t *   _in , __m128i *  out  ) {
+    (void) initOffset;
+    (void) _in;
+    (void) out;
+}
+""") 
+  # Generates ipackFOR1..ipackFOR32. Each generated function reads `length`
+  # input vectors, subtracts initOffset from each (skipped for width 32) and
+  # ORs the shifted results into OutReg, storing OutReg whenever a 32-bit
+  # word is full.
+  for bit in range(1,33):
+    offsetVar = " initOffset";
+    print("""  
+static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t *   _in, __m128i *   out) {
+    const __m128i       *in = (const __m128i*)(_in);
+    __m128i    OutReg;
+
+      """);
+    
+    if (bit != 32):
+      print("    __m128i CurrIn = _mm_load_si128(in);");
+      print("    __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);");
+    else:
+      print("    __m128i InReg = _mm_load_si128(in);");
+      print("    (void) initOffset;");
+
+
+    # inwordpointer: bit position inside the current 32-bit output word where
+    # the next value starts; valuecounter: input vectors consumed so far.
+    inwordpointer = 0
+    valuecounter = 0
+    for k in range(ceil((length * bit) / 32)):
+      if(valuecounter == length): break
+      for x in range(inwordpointer,32,bit):
+        if(x!=0) :
+          print("    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));");
+        else:
+          print("    OutReg = InReg; ");
+        if((x+bit>=32) ):
+          # this value fills (or overflows) the current word: store the word
+          while(inwordpointer<32):
+            inwordpointer += bit
+          print("    _mm_store_si128(out, OutReg);");
+          print("");
+
+          if(valuecounter + 1 < length):
+            print("    ++out;")
+          inwordpointer -= 32;
+          # leftover high bits of the value open the next output word
+          if(inwordpointer>0):
+            print("    OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");");
+        if(valuecounter + 1 < length):
+          print("    ++in;") 
+
+          if (bit != 32):
+            print("    CurrIn = _mm_load_si128(in);");
+            print("    InReg = _mm_sub_epi32(CurrIn, initOffset);");
+          else:
+            print("    InReg = _mm_load_si128(in);");
+          print("");
+        valuecounter = valuecounter + 1
+        if(valuecounter == length): break
+    assert(valuecounter == length)
+    print("\n}\n\n""")
+
+  # Generates iunpackFOR1..iunpackFOR31 (width 32 is emitted separately after
+  # this loop). Each generated function extracts `length` vectors of `bit`-wide
+  # values, adds initOffset to each, stores them, and returns initOffset.
+  for bit in range(1,32):
+    offsetVar = " initOffset";
+    print("""\n
+static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const  __m128i*   in, uint32_t *   _out) {
+      """);
+    print("""    __m128i*   out = (__m128i*)(_out);
+    __m128i    InReg = _mm_load_si128(in);
+    __m128i    OutReg;    
+    __m128i     tmp;
+    const __m128i mask =  _mm_set1_epi32((1U<<"""+str(bit)+""")-1);
+
+    """);
+
+    # The function body is accumulated in MainText and printed in one go below.
+    MainText = "";
+
+    MainText += "\n";
+    # inwordpointer: bit position inside the current 32-bit input word where
+    # the next value starts; valuecounter: output vectors produced so far.
+    inwordpointer = 0
+    valuecounter = 0
+    for k in range(ceil((length * bit) / 32)):
+      for x in range(inwordpointer,32,bit):
+        if(valuecounter == length): break
+        if (x > 0):
+          MainText += "    tmp = _mm_srli_epi32(InReg," + str(x) +");\n"; 
+        else:
+          MainText += "    tmp = InReg;\n"; 
+        if(x+bit<32):
+          MainText += "    OutReg = _mm_and_si128(tmp, mask);\n";
+        else:
+          MainText += "    OutReg = tmp;\n";        
+        if((x+bit>=32) ):      
+          # the value reaches the end of the current word: load the next word
+          while(inwordpointer<32):
+            inwordpointer += bit
+          if(valuecounter + 1 < length):
+             MainText += "    ++in;"
+             MainText += "    InReg = _mm_load_si128(in);\n";
+          inwordpointer -= 32;
+          # the value continues in the next word: merge its remaining high bits
+          if(inwordpointer>0):
+            MainText += "    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n";
+        # always true here, since this loop stops at width 31
+        if (bit != 32):
+          MainText += "    OutReg = _mm_add_epi32(OutReg, initOffset);\n"; 
+        MainText += "    _mm_store_si128(out++, OutReg);\n\n"; 
+        MainText += "";
+        valuecounter = valuecounter + 1
+        if(valuecounter == length): break
+    assert(valuecounter == length)
+    print(MainText)
+    print("    return initOffset;");
+    print("\n}\n\n")
+  print("""
+static __m128i iunpackFOR32(__m128i initvalue , const  __m128i*   in, uint32_t *    _out) {
+	__m128i * mout = (__m128i *)_out;
+	__m128i invec;
+	size_t k;
+	for(k = 0; k < 128/4; ++k) {
+		invec =  _mm_load_si128(in++);
+	    _mm_store_si128(mout++, invec);
+	}
+	return invec;
+}
+  """)
--- a/cpp/simdcomp/simdcomp.def
+++ b/cpp/simdcomp/simdcomp.def
@@ -0,0 +1,40 @@
+; Symbols exported from simdcomp.dll. Each name is listed exactly once.
+EXPORTS
+	simdpack
+	simdpackwithoutmask
+	simdunpack
+	bits
+	maxbits
+	maxbits_length
+	simdmin
+	simdmin_length
+	simdmaxmin
+	simdmaxmin_length
+	simdmaxbitsd1
+	simdmaxbitsd1_length
+	simdpackd1
+	simdpackwithoutmaskd1
+	simdunpackd1
+	simdsearchd1
+	simdsearchwithlengthd1
+	simdselectd1
+	simdpackFOR
+	simdselectFOR
+	simdsearchwithlengthFOR
+	simdunpackFOR
+	simdpack_length
+	simdpackFOR_length
+	simdunpackFOR_length
+	simdpack_shortlength
+	simdfastsetFOR
+	simdfastset
+	simdfastsetd1
+	simdunpack_length
+	simdunpack_shortlength
+	simdscand1
+	simdfastsetd1fromprevious
+
--- a/cpp/simdcomp/src/avxbitpacking.c
+++ b/cpp/simdcomp/src/avxbitpacking.c
--- a/cpp/simdcomp/src/simdbitpacking.c
+++ b/cpp/simdcomp/src/simdbitpacking.c
--- a/cpp/simdcomp/src/simdcomputil.c
+++ b/cpp/simdcomp/src/simdcomputil.c
@@ -0,0 +1,234 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#include "simdcomputil.h"
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+#include <assert.h>
+
+/*
+ * Delta(curr, prev): lane-wise difference between each 32-bit value and its
+ * predecessor. The vector subtracted from curr is curr shifted up by one lane,
+ * with lane 0 filled from the last lane of prev:
+ *   (c0 - p3, c1 - c0, c2 - c1, c3 - c2)
+ */
+#define Delta(curr, prev) \
+    _mm_sub_epi32(curr, \
+            _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12)))
+
+/* Bit width of v: 1 + index of its most significant set bit, or 0 when v is 0. */
+uint32_t bits(const uint32_t v) {
+    if (v == 0) {
+        return 0;
+    }
+#ifdef _MSC_VER
+    {
+        unsigned long highest; /* zero-based index of the most significant set bit */
+        _BitScanReverse(&highest, v);
+        return highest + 1;
+    }
+#else
+    return 32 - __builtin_clz(v); /* assume GCC-like compiler if not microsoft */
+#endif
+}
+
+
+
+/* ORs the four 32-bit lanes of accumulator together and returns the bit width of the result. */
+static uint32_t maxbitas32int(const __m128i accumulator) {
+	/* fold the upper two lanes onto the lower two: (A,B,C,D) or (C,D,0,0) */
+	const __m128i halves = _mm_or_si128(accumulator, _mm_srli_si128(accumulator, 8));
+	/* fold lane 1 onto lane 0, which then holds A|B|C|D */
+	const __m128i folded = _mm_or_si128(halves, _mm_srli_si128(halves, 4));
+	const uint32_t lane0 = _mm_cvtsi128_si32(folded);
+	return bits(lane0);
+}
+
+/* Bit width of the OR of the SIMDBlockSize integers starting at begin. */
+SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) {
+	const __m128i * const vectors = (const __m128i *) begin;
+	__m128i ored = _mm_loadu_si128(vectors);
+	uint32_t idx;
+	/* four integers per vector; the first vector is already in the accumulator */
+	for (idx = 1; 4 * idx < SIMDBlockSize; ++idx) {
+		ored = _mm_or_si128(ored, _mm_loadu_si128(vectors + idx));
+	}
+	return maxbitas32int(ored);
+}
+/* Returns the bitwise OR of the four 32-bit lanes of accumulator. */
+static uint32_t orasint(const __m128i accumulator) {
+	/* fold the upper two lanes onto the lower two: (A,B,C,D) or (C,D,0,0) */
+	const __m128i halves = _mm_or_si128(accumulator, _mm_srli_si128(accumulator, 8));
+	/* fold lane 1 onto lane 0, which then holds A|B|C|D */
+	const __m128i folded = _mm_or_si128(halves, _mm_srli_si128(halves, 4));
+	return _mm_cvtsi128_si32(folded);
+}
+
+#ifdef __SSE4_1__
+
+/* Returns the unsigned minimum of the four 32-bit lanes of accumulator. */
+static uint32_t minasint(const __m128i accumulator) {
+	/* lanes 0,1 become (min(A,C), min(B,D)); the upper lanes are not used afterwards */
+	const __m128i halves = _mm_min_epu32(accumulator, _mm_srli_si128(accumulator, 8));
+	/* lane 0 becomes the minimum of all four inputs */
+	const __m128i folded = _mm_min_epu32(halves, _mm_srli_si128(halves, 4));
+	return _mm_cvtsi128_si32(folded);
+}
+
+/* Returns the unsigned maximum of the four 32-bit lanes of accumulator. */
+static uint32_t maxasint(const __m128i accumulator) {
+	/* lanes 0,1 become (max(A,C), max(B,D)); the upper lanes are not used afterwards */
+	const __m128i halves = _mm_max_epu32(accumulator, _mm_srli_si128(accumulator, 8));
+	/* lane 0 becomes the maximum of all four inputs */
+	const __m128i folded = _mm_max_epu32(halves, _mm_srli_si128(halves, 4));
+	return _mm_cvtsi128_si32(folded);
+}
+
+/* Unsigned minimum of the SIMDBlockSize integers starting at in. */
+uint32_t simdmin(const uint32_t * in) {
+	const __m128i * const vectors = (const __m128i *) in;
+	__m128i lanemin = _mm_loadu_si128(vectors);
+	uint32_t idx;
+	/* four integers per vector; the first vector seeds the accumulator */
+	for (idx = 1; 4 * idx < SIMDBlockSize; ++idx) {
+		lanemin = _mm_min_epu32(lanemin, _mm_loadu_si128(vectors + idx));
+	}
+	return minasint(lanemin);
+}
+
+/*
+ * Computes the unsigned minimum and maximum of the SIMDBlockSize integers
+ * starting at in and writes them to *getmin and *getmax.
+ */
+void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) {
+	const __m128i * const vectors = (const __m128i *) in;
+	__m128i lanemin = _mm_loadu_si128(vectors);
+	__m128i lanemax = lanemin;
+	uint32_t idx;
+	/* four integers per vector; the first vector seeds both accumulators */
+	for (idx = 1; 4 * idx < SIMDBlockSize; ++idx) {
+		const __m128i current = _mm_loadu_si128(vectors + idx);
+		lanemin = _mm_min_epu32(lanemin, current);
+		lanemax = _mm_max_epu32(lanemax, current);
+	}
+	*getmin = minasint(lanemin);
+	*getmax = maxasint(lanemax);
+}
+
+
+/*
+ * Unsigned minimum of the first length integers at in.
+ * Returns 0xFFFFFFFF when length is 0.
+ */
+uint32_t simdmin_length(const uint32_t * in, uint32_t length) {
+	const uint32_t nvectors = length / 4;
+	uint32_t smallest = 0xFFFFFFFF;
+	uint32_t pos;
+	if (nvectors > 0) {
+		/* whole groups of four integers are reduced with SSE */
+		const __m128i * const vectors = (const __m128i *) in;
+		__m128i lanemin = _mm_loadu_si128(vectors);
+		for (pos = 1; pos < nvectors; ++pos) {
+			lanemin = _mm_min_epu32(lanemin, _mm_loadu_si128(vectors + pos));
+		}
+		smallest = minasint(lanemin);
+	}
+	/* up to three trailing integers are handled one by one */
+	for (pos = nvectors * 4; pos < length; ++pos) {
+		if (in[pos] < smallest) {
+			smallest = in[pos];
+		}
+	}
+	return smallest;
+}
+
+/*
+ * Computes the unsigned minimum and maximum of the first length integers at in.
+ * With length 0, *getmin is 0xFFFFFFFF and *getmax is 0.
+ */
+void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) {
+	const uint32_t nvectors = length / 4;
+	uint32_t pos;
+	*getmin = 0xFFFFFFFF;
+	*getmax = 0;
+	if (nvectors > 0) {
+		/* whole groups of four integers are reduced with SSE */
+		const __m128i * const vectors = (const __m128i *) in;
+		__m128i lanemin = _mm_loadu_si128(vectors);
+		__m128i lanemax = lanemin;
+		for (pos = 1; pos < nvectors; ++pos) {
+			const __m128i current = _mm_loadu_si128(vectors + pos);
+			lanemin = _mm_min_epu32(lanemin, current);
+			lanemax = _mm_max_epu32(lanemax, current);
+		}
+		*getmin = minasint(lanemin);
+		*getmax = maxasint(lanemax);
+	}
+	/* up to three trailing integers are handled one by one */
+	for (pos = nvectors * 4; pos < length; ++pos) {
+		if (in[pos] < *getmin) {
+			*getmin = in[pos];
+		}
+		if (in[pos] > *getmax) {
+			*getmax = in[pos];
+		}
+	}
+}
+
+#endif
+
+/* Bit width of the OR of the first length integers at in (0 when length is 0). */
+SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t * in,uint32_t length) {
+	const uint32_t nvectors = length / 4;
+	uint32_t ored = 0;
+	uint32_t pos;
+	if (nvectors > 0) {
+		/* whole groups of four integers are ORed with SSE */
+		const __m128i * const vectors = (const __m128i *) in;
+		__m128i laneor = _mm_loadu_si128(vectors);
+		for (pos = 1; pos < nvectors; ++pos) {
+			laneor = _mm_or_si128(laneor, _mm_loadu_si128(vectors + pos));
+		}
+		ored = orasint(laneor);
+	}
+	/* up to three trailing integers are handled one by one */
+	for (pos = nvectors * 4; pos < length; ++pos) {
+		ored |= in[pos];
+	}
+	return bits(ored);
+}
+
+
+/*
+ * maxbit over 128 integers (SIMDBlockSize) with provided initial value:
+ * returns the bit width of the OR of all successive differences, where the
+ * first integer is differenced against initvalue.
+ */
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) {
+    const __m128i  initoffset = _mm_set1_epi32 (initvalue);
+    const __m128i* pin = (const __m128i*)(in);
+    __m128i newvec = _mm_loadu_si128(pin);
+    __m128i accumulator = Delta(newvec , initoffset);
+    __m128i oldvec = newvec;
+    uint32_t k = 1;
+    for(; 4*k < SIMDBlockSize; ++k) {
+        newvec = _mm_loadu_si128(pin+k);
+        /* lane 0 of each vector is differenced against the last lane of the previous one */
+        accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec));
+        oldvec = newvec;
+    }
+    return maxbitas32int(accumulator);
+}
+
+
+/*
+ * maxbit over |length| integers with provided initial value:
+ * returns the bit width of the OR of all successive differences, where in[0]
+ * is differenced against initvalue. length must be greater than 0 (asserted).
+ */
+uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
+                uint32_t length) {
+    __m128i newvec;
+    __m128i oldvec;
+    __m128i initoffset;
+    __m128i accumulator;
+    const __m128i *pin;
+    uint32_t tmparray[4];
+    uint32_t k = 1;
+    uint32_t acc;
+
+    assert(length > 0);
+
+    pin = (const __m128i *)(in);
+    initoffset = _mm_set1_epi32(initvalue);
+    /* Fewer than four inputs: build the first vector by repeating the last
+       value, so the padding lanes contribute a difference of zero and no
+       memory past in[length - 1] is read. */
+    switch (length) {
+      case 1:
+        newvec = _mm_set1_epi32(in[0]);
+        break;
+      case 2:
+        newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
+        break;
+      case 3:
+        newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
+        break;
+      default:
+        newvec = _mm_loadu_si128(pin);
+        break;
+    }
+    accumulator = Delta(newvec, initoffset);
+    oldvec = newvec;
+
+    /* process 4 integers and build an accumulator */
+    while (k * 4 + 4 <= length) {
+        newvec = _mm_loadu_si128(pin + k);
+        accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
+        oldvec = newvec;
+        k++;
+    }
+
+    /* extract the accumulator as an integer */
+    _mm_storeu_si128((__m128i *)(tmparray), accumulator);
+    acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3];
+
+    /* now process the remaining integers */
+    /* k is at least 1 before the multiplication, so this loop starts at index 4
+       or later and the k == 0 branch below is never taken */
+    for (k *= 4; k < length; k++)
+        acc |= in[k] - (k == 0 ? initvalue : in[k - 1]);
+
+    /* return the number of bits */
+    return bits(acc);
+}
--- a/cpp/simdcomp/src/simdfor.c
+++ b/cpp/simdcomp/src/simdfor.c
--- a/cpp/simdcomp/src/simdintegratedbitpacking.c
+++ b/cpp/simdcomp/src/simdintegratedbitpacking.c
--- a/cpp/simdcomp/src/simdpackedsearch.c
+++ b/cpp/simdcomp/src/simdpackedsearch.c
--- a/cpp/simdcomp/src/simdpackedselect.c
+++ b/cpp/simdcomp/src/simdpackedselect.c
--- a/cpp/simdcomp/tests/unit.c
+++ b/cpp/simdcomp/tests/unit.c
@@ -0,0 +1,900 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "simdcomp.h"
+
+
+
+/**
+ * Round-trip test for simdpack_shortlength / simdunpack_shortlength.
+ * For every bit width 0..31 and every length 0..128 it packs random data,
+ * compares the number of bytes written with simdpack_compressedbytes, and
+ * checks the unpacked values against the input.
+ * Returns 0 on success, -1 on the first mismatch. Buffers are freed on both paths.
+ */
+int testshortpack() {
+	int bit;
+	size_t i;
+	size_t length;
+	__m128i * bb;
+	int status = 0;
+	srand(0);
+	printf("testshortpack\n");
+	for (bit = 0; bit < 32 && status == 0; ++bit) {
+		const size_t N = 128;
+		uint32_t * data = malloc(N * sizeof(uint32_t));
+		uint32_t * backdata = malloc(N * sizeof(uint32_t));
+		uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		for (i = 0; i < N; ++i) {
+			/* unsigned 1: shifting a signed 1 by 31 would overflow int */
+			data[i] = rand() & ((1U << bit) - 1);
+		}
+		for (length = 0; length <= N && status == 0; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			bb = simdpack_shortlength(data, length, (__m128i *) buffer,
+					bit);
+			if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
+				printf("bug\n");
+				status = -1;
+				break;
+			}
+			simdunpack_shortlength((__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					printf("bug\n");
+					status = -1;
+					break;
+				}
+			}
+		}
+		free(data);
+		free(backdata);
+		free(buffer);
+	}
+	return status;
+}
+
+/**
+ * Round-trip test for simdpack_length / simdunpack_length.
+ * For every bit width 0..31 and every length 0..2048 it packs random data,
+ * compares the number of bytes written with simdpack_compressedbytes, and
+ * checks the unpacked values against the input.
+ * Returns 0 on success, -1 on the first mismatch. Buffers are freed on both paths.
+ */
+int testlongpack() {
+	int bit;
+	size_t i;
+	size_t length;
+	__m128i * bb;
+	int status = 0;
+	srand(0);
+	printf("testlongpack\n");
+	for (bit = 0; bit < 32 && status == 0; ++bit) {
+		const size_t N = 2048;
+		uint32_t * data = malloc(N * sizeof(uint32_t));
+		uint32_t * backdata = malloc(N * sizeof(uint32_t));
+		uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		for (i = 0; i < N; ++i) {
+			/* unsigned 1: shifting a signed 1 by 31 would overflow int */
+			data[i] = rand() & ((1U << bit) - 1);
+		}
+		for (length = 0; length <= N && status == 0; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			bb = simdpack_length(data, length, (__m128i *) buffer,
+					bit);
+			if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
+				printf("bug\n");
+				status = -1;
+				break;
+			}
+			simdunpack_length((__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					printf("bug\n");
+					status = -1;
+					break;
+				}
+			}
+		}
+		free(data);
+		free(backdata);
+		free(buffer);
+	}
+	return status;
+}
+
+
+
+/**
+ * Tests simdfastset against simdpack/simdunpack for bit widths 0..31:
+ *  1. plain pack/unpack round trip;
+ *  2. writing data[] into the slots in reverse order and checking the
+ *     unpacked block is the input reversed;
+ *  3. re-packing, rewriting every slot with its own value, and checking the
+ *     unpacked block equals the input.
+ * Returns 0 on success, -1 on the first mismatch. Buffers are freed on both paths.
+ */
+int testset() {
+	int bit;
+	size_t i;
+	int status = -1;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	srand(0);
+
+	for (bit = 0; bit < 32; ++bit) {
+		printf("simple set %d \n",bit);
+
+		for (i = 0; i < N; ++i) {
+			/* unsigned 1: shifting a signed 1 by 31 would overflow int */
+			data[i] = rand() & ((1U << bit) - 1);
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpack(data, (__m128i *) buffer, bit);
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i]) {
+				printf("bug\n");
+				goto cleanup;
+			}
+		}
+
+		/* slot i - 1 receives data[N - i]: the block ends up reversed */
+		for(i = N  ; i > 0; i--) {
+			simdfastset((__m128i *) buffer, bit, data[N - i], i - 1);
+		}
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[N - i - 1]) {
+				printf("bug\n");
+				goto cleanup;
+			}
+		}
+		simdpack(data, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastset((__m128i *) buffer, bit, data[i - 1], i - 1);
+		}
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i]) {
+				printf("bug\n");
+				goto cleanup;
+			}
+		}
+
+	}
+	status = 0;
+
+cleanup:
+	free(data);
+	free(backdata);
+	free(buffer);
+
+	return status;
+}
+
+#ifdef __SSE4_1__
+
+/**
+ * Tests simdfastsetd1 / simdselectd1 for bit widths 0..31: starting from a packed
+ * block of zeroes, each slot is set to a value from a non-decreasing sequence and
+ * immediately read back, then the whole block is unpacked and compared.
+ * Returns 0 on success, -1 on the first mismatch. Buffers are freed on both paths.
+ */
+int testsetd1() {
+	int bit;
+	size_t i;
+	uint32_t newvalue;
+	int status = -1;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
+
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	srand(0);
+	for (bit = 0; bit < 32; ++bit) {
+		printf("simple set d1 %d \n",bit);
+		/* unsigned 1: shifting a signed 1 by 31 would overflow int */
+		data[0] = rand() & ((1U << bit) - 1);
+		datazeroes[0] = 0;
+
+		for (i = 1; i < N; ++i) {
+			data[i] = data[i - 1] + (rand() & ((1U << bit) - 1));
+			datazeroes[i] = 0;
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpackd1(0,datazeroes, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastsetd1(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
+			newvalue = simdselectd1(0, (const __m128i *) buffer, bit,i - 1);
+			if( newvalue != data[i-1] ) {
+				printf("bad set-select\n");
+				goto cleanup;
+			}
+		}
+		simdunpackd1(0,(__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i])
+				goto cleanup;
+		}
+	}
+	status = 0;
+
+cleanup:
+	free(data);
+	free(backdata);
+	free(buffer);
+	free(datazeroes);
+	return status;
+}
+#endif
+
+/**
+ * Tests simdfastsetFOR / simdselectFOR for bit widths 0..31: starting from a packed
+ * block of zeroes, each slot is set to a random value and immediately read back,
+ * then the whole block is unpacked and compared.
+ * Returns 0 on success, -1 on the first mismatch. Buffers are freed on both paths.
+ */
+int testsetFOR() {
+	int bit;
+	size_t i;
+	uint32_t newvalue;
+	int status = -1;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
+
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	srand(0);
+	for (bit = 0; bit < 32; ++bit) {
+		printf("simple set FOR %d \n",bit);
+		for (i = 0; i < N; ++i) {
+			/* unsigned 1: shifting a signed 1 by 31 would overflow int */
+			data[i] = (rand() & ((1U << bit) - 1));
+			datazeroes[i] = 0;
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpackFOR(0,datazeroes, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastsetFOR(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
+			newvalue = simdselectFOR(0, (const __m128i *) buffer, bit,i - 1);
+			if( newvalue != data[i-1] ) {
+				printf("bad set-select\n");
+				goto cleanup;
+			}
+		}
+		simdunpackFOR(0,(__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i])
+				goto cleanup;
+		}
+	}
+	status = 0;
+
+cleanup:
+	free(data);
+	free(backdata);
+	free(buffer);
+	free(datazeroes);
+	return status;
+}
+
+/**
+ * Round-trip test for simdpackFOR_length / simdunpackFOR_length with offset 7.
+ * For every bit width 0..31 and every length 0..128 it packs random data
+ * (each value is offset plus a bit-wide random number), compares the number of
+ * bytes written with simdpackFOR_compressedbytes, and checks the unpacked values.
+ * Returns 0 on success, -1 on the first mismatch. Buffers are freed on both paths.
+ */
+int testshortFORpack() {
+	int bit;
+	size_t i;
+	__m128i * rb;
+	size_t length;
+	uint32_t offset = 7;
+	int status = 0;
+	srand(0);
+	for (bit = 0; bit < 32 && status == 0; ++bit) {
+		const size_t N = 128;
+		uint32_t * data = malloc(N * sizeof(uint32_t));
+		uint32_t * backdata = malloc(N * sizeof(uint32_t));
+		uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		for (i = 0; i < N; ++i) {
+			/* unsigned 1: shifting a signed 1 by 31 would overflow int */
+			data[i] = (rand() & ((1U << bit) - 1)) + offset;
+		}
+		for (length = 0; length <= N && status == 0; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			rb = simdpackFOR_length(offset,data, length, (__m128i *) buffer,
+					bit);
+			if(((rb - (__m128i *) buffer)*sizeof(__m128i)) != (unsigned) simdpackFOR_compressedbytes(length,bit)) {
+				status = -1;
+				break;
+			}
+			simdunpackFOR_length(offset,(__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					status = -1;
+					break;
+				}
+			}
+		}
+		free(data);
+		free(backdata);
+		free(buffer);
+	}
+	return status;
+}
+
+
+#ifdef __AVX2__
+
+/**
+ * AVX round-trip test: for every bit width 0..31, runs 100 trials that fill one
+ * block of AVXBlockSize random values, check avxmaxbits against maxbits_length,
+ * then pack with avxpackwithoutmask and unpack with avxunpack.
+ * On a mismatch the whole block is dumped. Returns 0 on success, -1 on failure.
+ * Buffers are freed on both paths.
+ */
+int testbabyavx() {
+	int bit;
+	int trial;
+	unsigned int i,j;
+	int status = 0;
+	const size_t N = AVXBlockSize;
+	srand(0);
+	printf("testbabyavx\n");
+	printf("bit = ");
+	for (bit = 0; bit < 32 && status == 0; ++bit) {
+		printf(" %d ",bit);
+		fflush(stdout);
+		for(trial = 0; trial < 100 && status == 0; ++trial) {
+			uint32_t * data = malloc(N * sizeof(uint32_t)+ 64 * sizeof(uint32_t));
+			uint32_t * backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t) );
+			__m256i * buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32);
+
+			for (i = 0; i < N; ++i) {
+				/* shift an unsigned 1: a signed 1 shifted by 31 would overflow int */
+				data[i] = rand() & (((uint32_t) 1 << bit) - 1);
+			}
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			if(avxmaxbits(data) != maxbits_length(data,N)) {
+				printf("avxmaxbits is buggy\n");
+				status = -1;
+			} else {
+				avxpackwithoutmask(data, buffer, bit);
+				avxunpack(buffer, backdata, bit);
+				for (i = 0; i < AVXBlockSize; ++i) {
+					if (data[i] != backdata[i]) {
+						printf("bug\n");
+						for (j = 0; j < N; ++j) {
+							if (data[j] != backdata[j]) {
+								printf("data[%d]=%d v.s. backdata[%d]=%d\n",j,data[j],j,backdata[j]);
+							} else {
+								printf("data[%d]=%d\n",j,data[j]);
+							}
+						}
+						status = -1;
+						break;
+					}
+				}
+			}
+			free(data);
+			free(backdata);
+			free(buffer);
+		}
+	}
+	/* the terminating newline is only printed when every trial passed */
+	if (status == 0) {
+		printf("\n");
+	}
+	return status;
+}
+
+/**
+ * AVX2 round-trip test over 5000 blocks of AVXBlockSize integers. For gaps
+ * 1, 3, 9, ... 387420489 the input is datain[k] = k * gap (modulo 2^32); each
+ * block is checked with avxmaxbits vs. maxbits_length, packed with
+ * avxpackwithoutmask and unpacked with avxunpack.
+ * Returns 0 on success, -1 if avxmaxbits disagrees, -2 on a round-trip mismatch.
+ * Buffers are freed on every path.
+ */
+int testavx2() {
+    int N = 5000 * AVXBlockSize, gap;
+    int status = 0;
+    __m256i * buffer = malloc(AVXBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(AVXBlockSize * sizeof(uint32_t));
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", gap);
+        for (k = 0; k < N; ++k)
+            /* multiply as unsigned: k * gap exceeds INT_MAX for large gaps */
+            datain[k] = (uint32_t) k * (uint32_t) gap;
+        for (k = 0; k * AVXBlockSize < N; ++k) {
+            /*
+               First part works for general arrays (sorted or unsorted)
+            */
+            int j;
+            /* we compute the bit width */
+            const uint32_t b = avxmaxbits(datain + k * AVXBlockSize);
+            if(avxmaxbits(datain + k * AVXBlockSize) != maxbits_length(datain + k * AVXBlockSize,AVXBlockSize)) {
+                printf("avxmaxbits is buggy %d %d \n",
+                        avxmaxbits(datain + k * AVXBlockSize),
+                        maxbits_length(datain + k * AVXBlockSize,AVXBlockSize));
+                status = -1;
+                goto cleanup;
+            }
+            printf("bit width = %d\n",b);
+
+
+            /* we read AVXBlockSize integers at "datain + k * AVXBlockSize" and
+               write b 256-bit vectors at "buffer" */
+            avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b);
+            /* we read back b 256-bit vectors at "buffer" and write AVXBlockSize integers at backbuffer */
+            avxunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < AVXBlockSize; ++j) {
+                if (backbuffer[j] != datain[k * AVXBlockSize + j]) {
+                    int i;
+                    printf("bug in avxpack\n");
+                    for(i = 0; i < AVXBlockSize; ++i) {
+                        printf("data[%d]=%d got back %d %s\n",i,
+                                datain[k * AVXBlockSize + i],backbuffer[i],
+                                datain[k * AVXBlockSize + i]!=backbuffer[i]?"bug":"");
+                    }
+                    status = -2;
+                    goto cleanup;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+
+cleanup:
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
+#endif /* avx2 */
+
+/**
+ * Round-trip test for the 128-integer packers.
+ *
+ * For gap = 1, 3, 9, ..., 387420489 the input array is filled with k * gap
+ * (computed modulo 2^32). Every block of SIMDBlockSize integers is then
+ *   1. packed with simdpackwithoutmask and unpacked with simdunpack, and
+ *   2. packed with simdpackwithoutmaskd1 and unpacked with simdunpackd1
+ *      (differential coding, each block coded relative to 0),
+ * and the recovered block is compared with the input after each step.
+ *
+ * Returns 0 on success, -1 if a buffer could not be allocated, -2 if the
+ * plain round trip differs, -3 if the differential round trip differs.
+ * All buffers are released on every path.
+ */
+int test() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int status = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+
+    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        status = -1;
+        goto cleanup;
+    }
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", (unsigned)gap);
+        /* multiply as unsigned: the product exceeds INT_MAX for large gaps,
+           which would be undefined behaviour in signed arithmetic */
+        for (k = 0; k < N; ++k)
+            datain[k] = (uint32_t)k * (uint32_t)gap;
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            const uint32_t * block = datain + k * SIMDBlockSize;
+            int j;
+            /*
+               First part works for general arrays (sorted or unsorted)
+            */
+            /* we compute the bit width */
+            const uint32_t b = maxbits(block);
+            /* we read 128 integers at "block" and
+               write b 128-bit vectors at "buffer" */
+            simdpackwithoutmask(block, buffer, b);
+            /* we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (backbuffer[j] != block[j]) {
+                    printf("bug in simdpack\n");
+                    status = -2;
+                    goto cleanup;
+                }
+            }
+
+            {
+                /*
+                 next part uses differential coding; every block is coded
+                 relative to an initial value of 0
+                */
+                const uint32_t offset = 0;
+                /* we compute the bit width */
+                const uint32_t b1 = simdmaxbitsd1(offset, block);
+                /* we read 128 integers at "block" and
+                   write b1 128-bit vectors at "buffer" */
+                simdpackwithoutmaskd1(offset, block, buffer, b1);
+                /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+                simdunpackd1(offset, buffer, backbuffer, b1);
+                for (j = 0; j < SIMDBlockSize; ++j) {
+                    if (backbuffer[j] != block[j]) {
+                        printf("bug in simdpack d1\n");
+                        status = -3;
+                        goto cleanup;
+                    }
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+cleanup:
+    /* free(NULL) is a no-op, so this is safe after a failed allocation */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
+
+#ifdef __SSE4_1__
+/**
+ * Round-trip test for the frame-of-reference (FOR) packers.
+ *
+ * For gap = 1, 2, 4, ..., 2^28 the input array is filled with k * gap
+ * (computed modulo 2^32). For every block of SIMDBlockSize integers the
+ * minimum and maximum are obtained with simdmaxmin_length, the block is
+ * packed with simdpackFOR using bits(max - min) bits, and then checked twice:
+ * value by value through simdselectFOR, and as a whole through simdunpackFOR.
+ *
+ * Returns 0 on success, -1 if a buffer could not be allocated, -3 if
+ * simdselectFOR returns a wrong value, -2 if the unpacked block differs.
+ * All buffers are released on every path.
+ */
+int testFOR() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int status = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t tmax, tmin, tb;
+
+    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        status = -1;
+        goto cleanup;
+    }
+    for (gap = 1; gap <= 387420489; gap *= 2) {
+        int k;
+        printf(" gap = %u \n", (unsigned)gap);
+        /* multiply as unsigned: the product exceeds INT_MAX for large gaps,
+           which would be undefined behaviour in signed arithmetic */
+        for (k = 0; k < N; ++k)
+            datain[k] = (uint32_t)k * (uint32_t)gap;
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            const uint32_t * block = datain + k * SIMDBlockSize;
+            int j;
+            simdmaxmin_length(block, SIMDBlockSize, &tmin, &tmax);
+            /* we compute the bit width */
+            tb = bits(tmax - tmin);
+
+            /* we read 128 integers at "block" and
+               write tb 128-bit vectors at "buffer" */
+            simdpackFOR(tmin, block, buffer, tb);
+
+            /* random access into the packed block */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                uint32_t selectedvalue = simdselectFOR(tmin, buffer, tb, j);
+                if (selectedvalue != block[j]) {
+                    printf("bug in simdselectFOR\n");
+                    status = -3;
+                    goto cleanup;
+                }
+            }
+            /* we read back tb 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpackFOR(tmin, buffer, backbuffer, tb);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (backbuffer[j] != block[j]) {
+                    printf("bug in simdpackFOR\n");
+                    status = -2;
+                    goto cleanup;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+cleanup:
+    /* free(NULL) is a no-op, so this is safe after a failed allocation */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
+#endif
+
+#define MAX 300
+/**
+ * Checks simdmaxbitsd1_length on every buffer length from 1 to MAX.
+ *
+ * A buffer of length len holds 1, 2, ..., len - 1 followed by len + 1, so
+ * starting from an initial value of 0 its successive differences are
+ * (len - 1) ones followed by a single 2:
+ *   length 1: 2
+ *   length 2: 1 2
+ *   length 3: 1 1 2
+ *   etc.
+ * The expected bit width is therefore always 2.
+ *
+ * Returns 0 on success and -1 on the first unexpected width.
+ */
+int test_simdmaxbitsd1_length() {
+    uint32_t buffer[MAX + 1];
+    uint32_t width;
+    int len, pos;
+
+    /* poison the array so that a read past the given length would show up */
+    memset(buffer, 0xff, sizeof buffer);
+
+    for (len = 1; len <= MAX; len++) {
+        for (pos = 0; pos < len - 1; pos++)
+            buffer[pos] = pos + 1;
+        buffer[len - 1] = len + 1;
+
+        width = simdmaxbitsd1_length(0, buffer, len);
+        if (width != 2) {
+            printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n",
+                    width, len - 1);
+            return -1;
+        }
+    }
+    printf("simdmaxbitsd1_length: ok\n");
+    return 0;
+}
+
+/**
+ * qsort()-style comparator for uint32_t elements, ascending order.
+ * Returns -1 when *a < *b, 1 when *a > *b and 0 when they are equal.
+ */
+int uint32_cmp(const void *a, const void *b)
+{
+    const uint32_t lhs = *(const uint32_t *)a;
+    const uint32_t rhs = *(const uint32_t *)b;
+    /* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+#ifdef __SSE4_1__
+/**
+ * Searches delta-packed blocks with simdsearchd1 for every bit width 1..32.
+ *
+ * The block holds 1, 2, ..., 128 and is packed relative to an initial value
+ * of 0. The test expects:
+ *   - key 0      -> position 0, with a result greater than 0;
+ *   - key i      -> position i - 1, with a result equal to i (i = 1..128);
+ *   - key 200    -> position 128, with a result greater than 200.
+ *
+ * The searches are performed outside of assert() so that they still execute
+ * when the file is compiled with NDEBUG.
+ *
+ * Returns 0; a failed expectation aborts through assert().
+ */
+int test_simdpackedsearch() {
+    uint32_t buffer[128];
+    uint32_t result = 0;
+    int b, i;
+    int pos = 0;
+    uint32_t init = 0;
+    __m128i initial;
+
+    /* initialize the buffer */
+    for (i = 0; i < 128; i++)
+        buffer[i] = (uint32_t)(i + 1);
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 1; b <= 32; b++) {
+        uint32_t out[128];
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
+        printf("simdsearchd1: %d bits\n", b);
+
+        /* now perform the searches; "initial" is passed by address, so it
+         * is reset before every call */
+        initial = _mm_set1_epi32(init);
+        pos = simdsearchd1(&initial, (__m128i *)out, b, 0, &result);
+        assert(pos == 0);
+        assert(result > 0);
+
+        for (i = 1; i <= 128; i++) {
+            initial = _mm_set1_epi32(init);
+            pos = simdsearchd1(&initial, (__m128i *)out, b,
+                                    (uint32_t)i, &result);
+            assert(pos == i - 1);
+            assert(result == (unsigned)i);
+        }
+        initial = _mm_set1_epi32(init);
+        pos = simdsearchd1(&initial, (__m128i *)out, b, 200, &result);
+        assert(pos == 128);
+        assert(result > 200);
+    }
+    /* only read inside assert(): keep NDEBUG builds free of warnings */
+    (void)pos;
+    (void)result;
+    printf("simdsearchd1: ok\n");
+    return 0;
+}
+
+/*
+ * Exercises simdselectFOR and simdsearchwithlengthFOR on one 128-integer
+ * block for every nominal bit width b = 1..32.
+ *
+ * For each b the block is filled from maxv = 2^b - 1 (0xFFFFFFFF for b == 32),
+ * packed with simdpackFOR relative to its minimum, then every slot is read
+ * back with simdselectFOR and every stored value is searched for.
+ *
+ * Returns 0; a failed expectation aborts through assert(). All checks,
+ * including the search call results, live in assert(), so they only run when
+ * NDEBUG is not defined.
+ */
+int test_simdpackedsearchFOR() {
+    uint32_t buffer[128];
+    uint32_t result = 0;
+    int b;
+    uint32_t i;
+    uint32_t maxv, tmin, tmax, tb;
+    uint32_t out[128];
+
+    /* this test creates frame-of-reference encoded buffers with different
+     * bits, then searches for each stored key */
+    for (b = 1; b <= 32; b++) {
+        /* initialize the buffer */
+    	maxv = (b == 32)
+    			? 0xFFFFFFFF
+    					: ((1U<<b) - 1);
+        /* NOTE(review): the product maxv * (i + 1) is evaluated in 32 bits and
+         * wraps once it exceeds 2^32 - 1 (b >= 26 with large i), so the buffer
+         * is presumably not monotonic for those widths -- confirm this is
+         * what the test intends. */
+        for (i = 0; i < 128; i++)
+            buffer[i] = maxv * (i + 1) / 128;
+        simdmaxmin_length(buffer,SIMDBlockSize,&tmin,&tmax);
+   	    /* we compute the bit width */
+        tb  = bits(tmax - tmin);
+        /* pack relative to tmin using tb bits per value */
+        simdpackFOR(tmin, buffer, (__m128i *)out, tb);
+        /* the label below says "simdsearchd1" although this is the FOR test */
+        printf("simdsearchd1: %d bits\n", b);
+
+        /* now perform the searches */
+        /* first, random access: every slot must give back the stored value */
+        for (i = 0; i < 128; i++) {
+        	assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb,i));
+        }
+        /* then search each stored value; x is the reported position and
+         * "result" the value reported at that position */
+        for (i = 0; i < 128; i++) {
+            int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb,
+                                    128,buffer[i], &result) ;
+            assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == buffer[x]);
+            assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == result);
+            assert(buffer[x] == result);
+            assert(result == buffer[i]);
+            assert(buffer[x] == buffer[i]);
+        }
+    }
+    printf("simdsearchFOR: ok\n");
+    return 0;
+}
+
+/**
+ * Cross-checks simdsearchd1 against simdsearchwithlengthd1 on delta-packed
+ * blocks for every bit width b = 0..32.
+ *
+ * For each b, 128 pseudo-random deltas below 2^b are generated, sorted,
+ * prefix-summed into a non-decreasing sequence, packed with
+ * simdpackwithoutmaskd1 and verified with simdunpackd1. Three rounds of
+ * searches follow, using as key each stored value, each stored value minus
+ * one, and each stored value plus one. Both search functions must report the
+ * same position, and that position must behave like a lower bound.
+ *
+ * The searches are performed outside of assert() so that they still execute
+ * when the file is compiled with NDEBUG.
+ *
+ * Returns 0; a failed expectation aborts through assert().
+ */
+int test_simdpackedsearch_advanced() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t out[128];
+    uint32_t result = 0;
+    uint32_t b, i;
+    uint32_t init = 0;
+    __m128i initial = _mm_set1_epi32(init);
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = init;
+        /* initialize the buffer with deltas that fit in b bits */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = ((uint32_t)(1431655765 * i + 0xFFFFFFFF));
+            /* unsigned shift: 1 << 31 would overflow a signed int */
+            if (b < 32) buffer[i] %= (1U << b);
+        }
+
+        qsort(buffer, 128, sizeof(uint32_t), uint32_cmp);
+
+        /* turn the deltas into values by prefix-summing */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+        /* the sum may wrap around; force the sequence to be non-decreasing */
+        for (i = 1; i < 128; i++) {
+            if (buffer[i] < buffer[i - 1])
+                buffer[i] = buffer[i - 1];
+        }
+        assert(simdmaxbitsd1(init, buffer) <= b);
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits and check the round trip */
+        simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
+        simdunpackd1(init, (__m128i *)out, backbuffer, b);
+
+        for (i = 0; i < 128; i++) {
+            assert(buffer[i] == backbuffer[i]);
+        }
+
+        printf("advanced simdsearchd1: %d bits\n", (int)b);
+
+        /* round 1: search for every stored value */
+        for (i = 0; i < 128; i++) {
+            int pos, pos_len;
+            initial = _mm_set1_epi32(init);
+            pos = simdsearchd1(&initial, (__m128i *)out, b,
+                    buffer[i], &result);
+            pos_len = simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i], &result);
+            assert(pos == pos_len);
+            assert(buffer[pos] == buffer[i]);
+            if (pos > 0)
+                assert(buffer[pos - 1] < buffer[i]);
+            assert(result == buffer[i]);
+            (void)pos_len; /* only read inside assert() */
+        }
+        /* round 2: search for every stored value minus one */
+        for (i = 0; i < 128; i++) {
+            int pos, pos_len;
+            if (buffer[i] == 0) continue; /* key would wrap around */
+            initial = _mm_set1_epi32(init);
+            pos = simdsearchd1(&initial, (__m128i *)out, b,
+                    buffer[i] - 1, &result);
+            pos_len = simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i] - 1, &result);
+            assert(pos == pos_len);
+            assert(buffer[pos] >= buffer[i] - 1);
+            if (pos > 0)
+                assert(buffer[pos - 1] < buffer[i] - 1);
+            assert(result == buffer[pos]);
+            (void)pos_len; /* only read inside assert() */
+        }
+        /* round 3: search for every stored value plus one */
+        for (i = 0; i < 128; i++) {
+            int pos, pos_len;
+            if (buffer[i] + 1 == 0)
+                continue; /* key would wrap around */
+            initial = _mm_set1_epi32(init);
+            pos = simdsearchd1(&initial, (__m128i *)out, b,
+                    buffer[i] + 1, &result);
+            pos_len = simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i] + 1, &result);
+            assert(pos == pos_len);
+            if (pos == 128) {
+                /* not found: the key is above the largest stored value */
+                assert(buffer[i] == buffer[127]);
+            } else {
+                assert(buffer[pos] >= buffer[i] + 1);
+                if (pos > 0)
+                    assert(buffer[pos - 1] < buffer[i] + 1);
+                assert(result == buffer[pos]);
+            }
+            (void)pos_len; /* only read inside assert() */
+        }
+    }
+    (void)result; /* only read inside assert() */
+    printf("advanced simdsearchd1: ok\n");
+    return 0;
+}
+
+/**
+ * Random-access test for simdselectd1 on delta-packed blocks.
+ *
+ * The block holds the consecutive values 33, 34, ..., 160 and is packed
+ * relative to an initial value of 33 with every bit width from 1 to 32;
+ * each of the 128 slots must then select back the value that was stored.
+ *
+ * Returns 0; a failed expectation aborts through assert().
+ */
+int test_simdpackedselect() {
+    uint32_t buffer[128];
+    uint32_t initial = 33;
+    int width, slot;
+
+    /* consecutive values starting at "initial" */
+    slot = 0;
+    while (slot < 128) {
+        buffer[slot] = (uint32_t)(initial + slot);
+        slot++;
+    }
+
+    for (width = 1; width <= 32; width++) {
+        uint32_t out[128];
+        /* delta-encode using "width" bits per value */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, width);
+
+        printf("simdselectd1: %d bits\n", width);
+
+        /* every slot must give back the stored value */
+        for (slot = 0; slot < 128; slot++) {
+            assert(initial + slot
+                    == simdselectd1(initial, (__m128i *)out, width, (uint32_t)slot));
+        }
+    }
+    printf("simdselectd1: ok\n");
+    return 0;
+}
+
+/**
+ * Random-access test for simdselectd1 on less regular data.
+ *
+ * For every bit width b = 0..32, 128 deltas below 2^b are generated,
+ * prefix-summed starting from 33 into a non-decreasing sequence, packed with
+ * simdpackwithoutmaskd1, and each slot is read back with simdselectd1.
+ *
+ * Returns 0; a failed expectation aborts through assert().
+ */
+int test_simdpackedselect_advanced() {
+    uint32_t buffer[128];
+    uint32_t initial = 33;
+    uint32_t b;
+    int i;
+
+    /* this test creates delta encoded buffers with different bits, then
+     * reads every slot back */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        uint32_t out[128];
+        /* initialize the buffer with deltas that fit in b bits */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = ((uint32_t)(165576 * i));
+            /* unsigned shift: 1 << 31 would overflow a signed int */
+            if (b < 32) buffer[i] %= (1U << b);
+        }
+        /* turn the deltas into values by prefix-summing */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+
+        /* the sum may wrap around; force the sequence to be non-decreasing */
+        for (i = 1; i < 128; i++) {
+            if (buffer[i] < buffer[i - 1])
+                buffer[i] = buffer[i - 1];
+        }
+        assert(simdmaxbitsd1(initial, buffer) <= b);
+
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+
+        printf("simdselectd1: %d bits\n", (int)b);
+
+        /* every slot must give back the stored value */
+        for (i = 0; i < 128; i++) {
+            uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i);
+            assert(valretrieved == buffer[i]);
+            (void)valretrieved; /* only read inside assert() */
+        }
+    }
+    printf("advanced simdselectd1: ok\n");
+    return 0;
+}
+#endif
+
+
+/**
+ * Test driver: runs every test of this file in a fixed order and stops at
+ * the first one that returns a non-zero status.
+ *
+ * Tests that need SSE4.1 or AVX2 are only compiled in when the matching
+ * compiler macro is defined. On failure the label of the failing step is
+ * printed and the test's status becomes the exit status; otherwise
+ * "All tests OK!" is printed and 0 is returned.
+ */
+int main() {
+    int r;
+
+/* run one test; on a non-zero status print its label and leave main() */
+#define RUN_TEST(call, failure_msg) \
+    do { \
+        r = (call); \
+        if (r) { \
+            printf(failure_msg); \
+            return r; \
+        } \
+    } while (0)
+
+    RUN_TEST(testsetFOR(), "test failure 1\n");
+#ifdef __SSE4_1__
+    RUN_TEST(testsetd1(), "test failure 2\n");
+#endif
+    RUN_TEST(testset(), "test failure 3\n");
+    RUN_TEST(testshortFORpack(), "test failure 4\n");
+    RUN_TEST(testshortpack(), "test failure 5\n");
+    RUN_TEST(testlongpack(), "test failure 6\n");
+#ifdef __SSE4_1__
+    RUN_TEST(test_simdpackedsearchFOR(), "test failure 7\n");
+    RUN_TEST(testFOR(), "test failure 8\n");
+#endif
+#ifdef __AVX2__
+    RUN_TEST(testbabyavx(), "test failure baby avx\n");
+    RUN_TEST(testavx2(), "test failure 9 avx\n");
+#endif
+    RUN_TEST(test(), "test failure 9\n");
+    RUN_TEST(test_simdmaxbitsd1_length(), "test failure 10\n");
+#ifdef __SSE4_1__
+    RUN_TEST(test_simdpackedsearch(), "test failure 11\n");
+    RUN_TEST(test_simdpackedsearch_advanced(), "test failure 12\n");
+    RUN_TEST(test_simdpackedselect(), "test failure 13\n");
+    RUN_TEST(test_simdpackedselect_advanced(), "test failure 14\n");
+#endif
+
+#undef RUN_TEST
+
+    printf("All tests OK!\n");
+    return 0;
+}
--- a/cpp/simdcomp/tests/unit_chars.c
+++ b/cpp/simdcomp/tests/unit_chars.c
@@ -0,0 +1,102 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "simdcomp.h"
+
+
+/* Yields one pseudo-random byte (0..255) taken from rand().
+ * The expansion is fully parenthesized and carries no trailing semicolon, so
+ * the macro can be used inside a larger expression as well as on its own. */
+#define get_random_char() ((uint8_t)(rand() % 256))
+
+
+/* Returns 1 when the two 32-bit words agree byte for byte, 0 otherwise. */
+static int same_chars(const uint32_t * lhs, const uint32_t * rhs) {
+    uint8_t chars_lhs[4];
+    uint8_t chars_rhs[4];
+
+    memmove(chars_lhs, lhs, 4);
+    memmove(chars_rhs, rhs, 4);
+
+    return chars_lhs[0] == chars_rhs[0]
+        && chars_lhs[1] == chars_rhs[1]
+        && chars_lhs[2] == chars_rhs[2]
+        && chars_lhs[3] == chars_rhs[3];
+}
+
+/**
+ * Round-trip test on random "character" data.
+ *
+ * The outer loop runs once for gap = 1, 3, 9, ..., 387420489; gap is only
+ * printed, it does not shape the data. Each round fills the input with words
+ * built from four random bytes, then every block of SIMDBlockSize words is
+ *   1. packed with simdpackwithoutmask and unpacked with simdunpack, and
+ *   2. packed with simdpackwithoutmaskd1 and unpacked with simdunpackd1
+ *      (differential coding, each block coded relative to 0),
+ * and the recovered block is compared byte by byte with the input.
+ *
+ * Returns 0 on success, -1 if a buffer could not be allocated, -2 if the
+ * plain round trip differs, -3 if the differential round trip differs.
+ * All buffers are released on every path.
+ */
+int main() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int status = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+
+    if (buffer == NULL || datain == NULL || backbuffer == NULL) {
+        printf("out of memory\n");
+        status = -1;
+        goto cleanup;
+    }
+
+    srand((unsigned)time(NULL));
+
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", (unsigned)gap);
+
+        /* simulate some random character string, don't care about endianness */
+        for (k = 0; k < N; ++k) {
+            uint8_t tmp[4];
+
+            tmp[0] = get_random_char();
+            tmp[1] = get_random_char();
+            tmp[2] = get_random_char();
+            tmp[3] = get_random_char();
+
+            memmove(&datain[k], tmp, 4);
+        }
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            const uint32_t * block = datain + k * SIMDBlockSize;
+            int j;
+            /*
+               First part works for general arrays (sorted or unsorted)
+            */
+            /* we compute the bit width */
+            const uint32_t b = maxbits(block);
+            /* we read 128 integers at "block" and
+               write b 128-bit vectors at "buffer" */
+            simdpackwithoutmask(block, buffer, b);
+            /* we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (!same_chars(&block[j], &backbuffer[j])) {
+                    printf("bug in simdpack\n");
+                    status = -2;
+                    goto cleanup;
+                }
+            }
+
+            {
+                /*
+                 next part uses differential coding; every block is coded
+                 relative to an initial value of 0
+                */
+                const uint32_t offset = 0;
+                /* we compute the bit width */
+                const uint32_t b1 = simdmaxbitsd1(offset, block);
+                /* we read 128 integers at "block" and
+                   write b1 128-bit vectors at "buffer" */
+                simdpackwithoutmaskd1(offset, block, buffer, b1);
+                /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+                simdunpackd1(offset, buffer, backbuffer, b1);
+                for (j = 0; j < SIMDBlockSize; ++j) {
+                    if (!same_chars(&block[j], &backbuffer[j])) {
+                        printf("bug in simdpack\n");
+                        status = -3;
+                        goto cleanup;
+                    }
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+cleanup:
+    /* free(NULL) is a no-op, so this is safe after a failed allocation */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
--- a/cpp/simdcomp_wrapper.c
+++ b/cpp/simdcomp_wrapper.c
@@ -0,0 +1,42 @@
+#include "simdcomp.h"
+#include "simdcomputil.h"
+
+// Packs one block of integers with differential coding relative to `offset`.
+//
+// The first output byte receives the bit width computed by simdmaxbitsd1();
+// the packed data is written right after that byte.
+// Assumes datain has a size of 128 uint32
+// and that output is large enough to host the data.
+// Returns 1 + bit width * sizeof(__m128i), i.e. the header byte plus the
+// packed vectors.
+size_t compress_sorted(
+        const uint32_t* datain,
+        uint8_t* output,
+        const uint32_t offset) {
+    const uint32_t num_bits = simdmaxbitsd1(offset, datain);
+    uint8_t* const payload = output + 1;
+    output[0] = num_bits;
+    simdpackwithoutmaskd1(offset, datain, (__m128i *) payload, num_bits);
+    return num_bits * sizeof(__m128i) + 1;
+}
+
+// Unpacks one block of differentially coded integers, relative to `offset`.
+//
+// The first byte of compressed_data is read as the bit width; the packed
+// data is expected right after it (the layout written by compress_sorted).
+// Assumes output has room for 128 uint32 -- presumably the block size used
+// by simdunpackd1; confirm against simdcomp.h.
+// Returns 1 + bit width * sizeof(__m128i), i.e. the number of input bytes
+// accounted for by this block.
+size_t uncompress_sorted(
+        const uint8_t* compressed_data,
+        uint32_t* output,
+        uint32_t offset) {
+    const uint32_t b = *compressed_data++;
+    // the input is only read: keep it const-qualified through the cast
+    simdunpackd1(offset, (const __m128i *)compressed_data, output, b);
+    return 1 + b * sizeof(__m128i);
+}
+
+// Packs one block of integers without differential coding.
+//
+// The first output byte receives the bit width computed by maxbits(); the
+// packed data is written right after that byte.
+// Presumably datain must hold 128 uint32 and output must be large enough,
+// as for compress_sorted -- confirm against simdcomp.h.
+// Returns 1 + bit width * sizeof(__m128i).
+size_t compress_unsorted(
+        const uint32_t* datain,
+        uint8_t* output) {
+    const uint32_t num_bits = maxbits(datain);
+    uint8_t* const payload = output + 1;
+    output[0] = num_bits;
+    simdpackwithoutmask(datain, (__m128i *) payload, num_bits);
+    return num_bits * sizeof(__m128i) + 1;
+}
+
+// Unpacks one block of integers packed without differential coding.
+//
+// The first byte of compressed_data is read as the bit width; the packed
+// data is expected right after it (the layout written by compress_unsorted).
+// Assumes output has room for 128 uint32 -- presumably the block size used
+// by simdunpack; confirm against simdcomp.h.
+// Returns 1 + bit width * sizeof(__m128i), i.e. the number of input bytes
+// accounted for by this block.
+size_t uncompress_unsorted(
+        const uint8_t* compressed_data,
+        uint32_t* output) {
+    const uint32_t b = *compressed_data++;
+    // the input is only read: keep it const-qualified through the cast
+    simdunpack((const __m128i *)compressed_data, output, b);
+    return 1 + b * sizeof(__m128i);
+}
--- a/cpp/streamvbyte/.gitignore
+++ b/cpp/streamvbyte/.gitignore
@@ -0,0 +1,32 @@
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
--- a/cpp/streamvbyte/.travis.yml
+++ b/cpp/streamvbyte/.travis.yml
@@ -0,0 +1,7 @@
+language: c
+sudo: false
+compiler:
+  - gcc
+  - clang
+
+script: make && ./unit
--- a/cpp/streamvbyte/LICENSE
+++ b/cpp/streamvbyte/LICENSE
@@ -0,0 +1,202 @@
+Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
--- a/cpp/streamvbyte/README.md
+++ b/cpp/streamvbyte/README.md
@@ -0,0 +1,60 @@
+streamvbyte
+===========
+[![Build Status](https://travis-ci.org/lemire/streamvbyte.png)](https://travis-ci.org/lemire/streamvbyte)
+
+StreamVByte is a new integer compression technique that applies SIMD instructions (vectorization) to
+Google's Group Varint approach. The net result is faster than other byte-oriented compression
+techniques.
+
+The approach is patent-free, and the code is available under the Apache License.
+
+
+It includes fast differential coding.
+
+It assumes a recent Intel processor (e.g., Haswell or better).
+
+The code should build using most standard-compliant C99 compilers. The provided makefile
+expects a Linux-like system.
+
+
+Usage:
+
+      make
+      ./unit
+
+See example.c for an example.
+
+Short code sample:
+```C
+// suppose that datain is an array of uint32_t integers
+size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
+// here the result is stored in compressedbuffer using compsize bytes
+streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast)
+```
+
+If the values are sorted, then it might be preferable to use differential coding:
+```C
+// suppose that datain is an array of uint32_t integers
+size_t compsize = streamvbyte_delta_encode(datain, N, compressedbuffer,0); // encoding
+// here the result is stored in compressedbuffer using compsize bytes
+streamvbyte_delta_decode(compressedbuffer, recovdata, N,0); // decoding (fast)
+```
+You have to know how many integers were coded when you decompress. You can store this 
+information along with the compressed stream.
+
+See also
+--------
+* SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersection
+* The FastPFOR C++ library : Fast integer compression https://github.com/lemire/FastPFor
+* High-performance dictionary coding https://github.com/lemire/dictionary
+* LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker
+* The SIMDComp library: A simple C library for compressing lists of integers using binary packing https://github.com/lemire/simdcomp
+* MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
+* CSharpFastPFOR: A C# integer compression library https://github.com/Genbox/CSharpFastPFOR
+* JavaFastPFOR: A Java integer compression library https://github.com/lemire/JavaFastPFOR
+* Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
+* FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference
+* libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte
+* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
+* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
+
--- a/cpp/streamvbyte/example.c
+++ b/cpp/streamvbyte/example.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "streamvbyte.h"
+
+int main() {
+	int N = 5000;
+	uint32_t * datain = malloc(N * sizeof(uint32_t));
+	uint8_t * compressedbuffer = malloc((N + 3) / 4 + N * sizeof(uint32_t)); // worst case: 1 key byte per 4 values + 4 data bytes per value
+	uint32_t * recovdata = malloc(N * sizeof(uint32_t));
+	if (!datain || !compressedbuffer || !recovdata) return 1; // allocation failed
+	for (int k = 0; k < N; ++k)
+		datain[k] = 120;
+	size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
+	// here the result is stored in compressedbuffer using compsize bytes
+	size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast)
+	assert(compsize == compsize2); // the decoder must consume exactly what the encoder produced
+	free(datain);
+	free(compressedbuffer);
+	free(recovdata);
+	printf("Compressed %d integers down to %d bytes.\n",N,(int) compsize);
+	return 0;
+}
--- a/cpp/streamvbyte/include/streamvbyte.h
+++ b/cpp/streamvbyte/include/streamvbyte.h
@@ -0,0 +1,19 @@
+
+#ifndef VARINTDECODE_H_
+#define VARINTDECODE_H_
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <stdint.h>// please use a C99-compatible compiler
+#include <stddef.h>
+
+
+// Encode an array of a given length read from in to out in StreamVByte format.
+// Returns the number of bytes written. out needs room for (length + 3) / 4 key bytes plus up to 4 bytes per integer.
+size_t streamvbyte_encode(const uint32_t *in, uint32_t length, uint8_t *out);
+
+// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out.
+// Returns the number of bytes read (key bytes plus data bytes).
+size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t length);
+
+
+#endif /* VARINTDECODE_H_ */
--- a/cpp/streamvbyte/include/streamvbytedelta.h
+++ b/cpp/streamvbyte/include/streamvbytedelta.h
@@ -0,0 +1,24 @@
+/*
+ * streamvbytedelta.h
+ *
+ *  Created on: Apr 14, 2016
+ *      Author: lemire
+ */
+
+#ifndef INCLUDE_STREAMVBYTEDELTA_H_
+#define INCLUDE_STREAMVBYTEDELTA_H_
+
+#include <stdint.h> // uint8_t, uint32_t: keeps this header usable on its own
+#include <stddef.h> // size_t
+
+// Encode an array of a given length read from in to out in StreamVByte format.
+// Returns the number of bytes written.
+// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
+size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t length, uint8_t *out, uint32_t  prev);
+
+// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out.
+// Returns the number of bytes read.
+// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
+size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out, uint32_t length, uint32_t  prev);
+
+#endif /* INCLUDE_STREAMVBYTEDELTA_H_ */
--- a/cpp/streamvbyte/makefile
+++ b/cpp/streamvbyte/makefile
@@ -0,0 +1,58 @@
+# minimalist makefile
+.SUFFIXES:
+#
+.SUFFIXES: .cpp .o .c .h
+
+CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow
+LDFLAGS = -shared
+LIBNAME=libstreamvbyte.so.0.0.1
+# HEADERS and OBJECTS are set before any rule: make expands prerequisite lists while reading them
+HEADERS=./include/streamvbyte.h ./include/streamvbytedelta.h
+OBJECTS= streamvbyte.o streamvbytedelta.o
+
+.PHONY: all test install uninstall clean
+
+all:  unit $(LIBNAME)
+test: unit
+	./unit
+install: $(LIBNAME)
+	cp $(LIBNAME) /usr/local/lib
+	ln -sf /usr/local/lib/$(LIBNAME) /usr/local/lib/libstreamvbyte.so
+	ldconfig
+	cp $(HEADERS) /usr/local/include
+
+
+uninstall:
+	for h in $(HEADERS) ; do rm  /usr/local/$$h; done
+	rm  /usr/local/lib/$(LIBNAME)
+	rm /usr/local/lib/libstreamvbyte.so
+	ldconfig
+
+
+
+streamvbytedelta.o: ./src/streamvbytedelta.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/streamvbytedelta.c -Iinclude
+
+
+streamvbyte.o: ./src/streamvbyte.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/streamvbyte.c -Iinclude
+
+
+$(LIBNAME): $(OBJECTS)
+	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS)  $(LDFLAGS)
+
+
+
+
+example: ./example.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o example ./example.c -Iinclude  $(OBJECTS)
+
+unit: ./tests/unit.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude  $(OBJECTS)
+
+# NOTE(review): links by library name; presumably needs "make install" (or a library search path) first
+dynunit: ./tests/unit.c    $(HEADERS) $(LIBNAME)
+	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude  -lstreamvbyte
+
+clean:
+	rm -f unit dynunit *.o $(LIBNAME) example
--- a/cpp/streamvbyte/src/streamvbyte.c
+++ b/cpp/streamvbyte/src/streamvbyte.c
@@ -0,0 +1,495 @@
+#include "streamvbyte.h"
+#if defined(_MSC_VER)
+     /* Microsoft C/C++-compatible compiler */
+     #include <intrin.h>
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+     /* GCC-compatible compiler, targeting x86/x86-64 */
+     #include <x86intrin.h>
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+     /* GCC-compatible compiler, targeting ARM with NEON */
+     #include <arm_neon.h>
+#elif defined(__GNUC__) && defined(__IWMMXT__)
+     /* GCC-compatible compiler, targeting ARM with WMMX */
+     #include <mmintrin.h>
+#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
+     /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
+     #include <altivec.h>
+#elif defined(__GNUC__) && defined(__SPE__)
+     /* GCC-compatible compiler, targeting PowerPC with SPE */
+     #include <spe.h>
+#endif
+
+static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
+		10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
+		9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
+		11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
+		8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
+		12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
+		13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
+		11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
+		9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
+		10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
+		13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
+		13, 14, 15, 16 }; // lengthTable[k] = 4 + sum of the four 2-bit fields of k: data bytes covered by key byte k
+
+static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
+		-1, -1, 3, -1, -1, -1 }, // 1111
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 2111
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 3111
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 4111
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 1211
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 2211
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 3211
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 4211
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 1311
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 2311
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 3311
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 },      // 4311
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 },    // 1411
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 },     // 2411
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 },      // 3411
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 },       // 4411
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 },  // 1121
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 2121
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 3121
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 4121
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 1221
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 2221
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 3221
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 4221
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 1321
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 2321
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 3321
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 },       // 4321
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 },     // 1421
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 },      // 2421
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 },       // 3421
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 },       // 4421
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 },   // 1131
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 2131
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 3131
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 4131
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 1231
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 2231
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 3231
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 4231
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 1331
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 2331
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 3331
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 },       // 4331
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 },      // 1431
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 },       // 2431
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 },       // 3431
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 },       // 4431
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 },    // 1141
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 2141
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 3141
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 4141
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 1241
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 2241
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 3241
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 4241
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 1341
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 2341
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 3341
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 },       // 4341
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 },       // 1441
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 },       // 2441
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 },       // 3441
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 },       // 4441
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 },  // 1112
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 2112
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 3112
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 4112
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 1212
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 2212
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 3212
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 4212
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 1312
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 2312
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 3312
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 },       // 4312
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 },     // 1412
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 },      // 2412
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 },       // 3412
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 },       // 4412
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 },   // 1122
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 2122
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 3122
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 4122
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 1222
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 2222
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 3222
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 4222
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 1322
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 2322
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 3322
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 },       // 4322
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 },      // 1422
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 },       // 2422
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 },       // 3422
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 },       // 4422
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 },    // 1132
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 2132
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 3132
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 4132
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 1232
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 2232
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 3232
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 4232
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 1332
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 2332
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 3332
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 },       // 4332
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 },       // 1432
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 },       // 2432
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 },       // 3432
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 },       // 4432
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 },     // 1142
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 2142
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 3142
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 4142
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 1242
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 2242
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 3242
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 4242
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 1342
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 2342
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 3342
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 },       // 4342
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 },       // 1442
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 },       // 2442
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 },       // 3442
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 },       // 4442
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 },   // 1113
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 2113
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 3113
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 4113
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 1213
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 2213
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 3213
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 4213
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 1313
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 2313
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 3313
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 },       // 4313
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 },      // 1413
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 },       // 2413
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 },       // 3413
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 },       // 4413
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 },    // 1123
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 2123
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 3123
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 4123
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 1223
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 2223
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 3223
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 4223
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 1323
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 2323
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 3323
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 },       // 4323
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 },       // 1423
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 },       // 2423
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 },       // 3423
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 },       // 4423
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 },     // 1133
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 2133
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 3133
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 4133
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 1233
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 2233
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 3233
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 4233
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 1333
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 2333
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 3333
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 },       // 4333
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 },       // 1433
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 },       // 2433
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 },       // 3433
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 },       // 4433
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 },      // 1143
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 2143
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 3143
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 4143
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 1243
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 2243
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 3243
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 4243
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 1343
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 2343
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 3343
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 },       // 4343
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 },       // 1443
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 },       // 2443
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 },       // 3443
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 },       // 4443
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 },    // 1114
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 2114
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 3114
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 4114
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 1214
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 2214
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 3214
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 4214
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 1314
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 2314
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 3314
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 },       // 4314
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 },       // 1414
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 },       // 2414
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 },       // 3414
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 },       // 4414
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 },     // 1124
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 2124
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 3124
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 4124
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 1224
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 2224
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 3224
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 4224
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 1324
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 2324
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 3324
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 },       // 4324
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 },       // 1424
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 },       // 2424
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 },       // 3424
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 },       // 4424
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 },      // 1134
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 2134
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 3134
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 4134
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 1234
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 2234
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 3234
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 4234
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 1334
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 2334
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 3334
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 },       // 4334
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 },       // 1434
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 },       // 2434
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 },       // 3434
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 },       // 4434
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 },       // 1144
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 2144
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 3144
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 4144
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 1244
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 2244
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 3244
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 4244
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 1344
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 2344
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 3344
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 },       // 4344
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },       // 1444
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },       // 2444
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },       // 3444
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }        // 4444
+};
+
+// Writes val at *dataPtrPtr using the fewest whole bytes (1 to 4), least
+// significant byte first, advances *dataPtrPtr past the bytes written and
+// returns the 2-bit length code (number of bytes written minus one).
+// Bytes are stored one at a time: no alignment or aliasing assumptions.
+static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t *dataPtr = *dataPtrPtr;
+	uint8_t code;
+
+	if (val < (1 << 8)) { // 1 byte
+		code = 0;
+	} else if (val < (1 << 16)) { // 2 bytes
+		code = 1;
+	} else if (val < (1 << 24)) { // 3 bytes
+		code = 2;
+	} else { // 4 bytes
+		code = 3;
+	}
+
+	// little-endian byte order, which is what the shuffle-based decoder expects
+	for (uint8_t i = 0; i <= code; i++)
+		dataPtr[i] = (uint8_t)(val >> (8 * i));
+	*dataPtrPtr += code + 1;
+
+	return code;
+}
+
+static uint8_t *svb_encode_scalar(const uint32_t *in,
+		uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
+		uint32_t count) {
+	// Scalar encoder: stores one 2-bit length code per value in the key
+	// stream (four codes per key byte, lowest bits first) and the value
+	// bytes in the data stream. Returns the first unused data byte.
+	uint8_t pending = 0; // key byte currently being assembled
+	if (count == 0)
+		return dataPtr; // nothing to encode: no key byte is written either
+
+	for (uint32_t idx = 0; idx < count; idx++) {
+		uint8_t slot = (uint8_t)(idx & 3); // position of this value inside its key byte
+		if (slot == 0 && idx != 0) {
+			// the previous key byte is complete: flush it and start a new one
+			*keyPtr++ = pending;
+			pending = 0;
+		}
+		pending |= _encode_data(in[idx], &dataPtr) << (2 * slot);
+	}
+
+	*keyPtr = pending; // last (possibly partial) key byte, pointer not advanced
+	return dataPtr;
+}
+
+// Encode an array of a given length read from in to out in streamvbyte format.
+// Returns the number of bytes written (key bytes followed by data bytes).
+size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
+	uint8_t *keyPtr = out;
+	uint32_t keyLen = count / 4 + (count % 4 != 0); // 2 bits per value rounded up to full bytes; count + 3 would wrap near UINT32_MAX
+	uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
+	return svb_encode_scalar(in, keyPtr, dataPtr, count) - out;
+}
+
+static inline __m128i _decode_avx(uint32_t key,
+		const uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t len = lengthTable[key]; // data bytes used by the 4 integers of this key byte
+	__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr); // always reads 16 bytes, even when len < 16
+	// shuffleTable is a plain uint8_t array with no 16-byte alignment guarantee, so load it unaligned
+	__m128i Shuf = _mm_loadu_si128((const __m128i *) &shuffleTable[key]);
+	Data = _mm_shuffle_epi8(Data, Shuf); // spread the packed bytes into four 32-bit lanes
+	*dataPtrPtr += len;
+	return Data;
+}
+
+static inline void _write_avx(uint32_t *out, __m128i Vec) {
+	_mm_storeu_si128((__m128i *) out, Vec); // unaligned 16-byte store: writes out[0..3]
+}
+
+static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
+	const uint8_t *dataPtr = *dataPtrPtr;
+	uint32_t val; // assembled byte by byte (little-endian): no unaligned or type-punned loads
+
+	if (code == 0) { // 1 byte
+		val = dataPtr[0];
+		dataPtr += 1;
+	} else if (code == 1) { // 2 bytes
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8);
+		dataPtr += 2;
+	} else if (code == 2) { // 3 bytes
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8);
+		val |= (uint32_t) dataPtr[2] << 16;
+		dataPtr += 3;
+	} else {                      // code == 3, 4 bytes
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8) | ((uint32_t) dataPtr[2] << 16) | ((uint32_t) dataPtr[3] << 24);
+		dataPtr += 4;
+	}
+
+	*dataPtrPtr = dataPtr; // hand the advanced read position back to the caller
+	return val;
+}
+static const uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr,
+		const uint8_t *dataPtr, uint32_t count) {
+	// Scalar decoder: reads count values, taking the 2-bit length code of
+	// each one from the key stream (four codes per key byte, lowest bits
+	// first) and its bytes from the data stream.
+	// With count == 0 nothing is read or written.
+	uint32_t key = 0;
+
+	for (uint32_t idx = 0; idx < count; idx++) {
+		uint32_t slot = idx & 3; // position of this value inside its key byte
+		if (slot == 0)
+			key = *keyPtr++; // a key byte is fetched only when a value needs it
+		uint8_t code = (uint8_t)((key >> (2 * slot)) & 0x3);
+		outPtr[idx] = _decode_data(&dataPtr, code);
+	}
+
+	// pointer to first unused byte after end
+	return dataPtr;
+}
+
+const uint8_t *svb_decode_avx_simple(uint32_t *out,
+		const uint8_t *__restrict__ keyPtr, const uint8_t *__restrict__ dataPtr,
+		uint64_t count) { // decodes count integers into out; returns the pointer just past the consumed data bytes
+
+	uint64_t keybytes = count / 4; // number of full key bytes (4 integers each)
+	__m128i Data;
+	if (keybytes >= 8) { // SIMD path: 8 key bytes are read as one uint64_t and cover 32 integers
+
+		int64_t Offset = -(int64_t) keybytes / 8 + 1; // negative count of 8-key blocks handled by the loop; the last block is peeled off below
+
+		const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset; // biased so that keyPtr64[Offset] is the first block
+		uint64_t nextkeys = keyPtr64[Offset];
+		for (; Offset != 0; ++Offset) {
+			uint64_t keys = nextkeys;
+			nextkeys = keyPtr64[Offset + 1]; // fetch the keys of the next block ahead of use
+
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 4, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 8, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 12, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 16, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 20, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 24, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 28, Data);
+
+			out += 32;
+		}
+		{ // last 8-key block: same steps as the loop body, but no further keys are read
+			uint64_t keys = nextkeys;
+
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 4, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 8, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 12, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 16, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 20, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 24, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 28, Data);
+
+			out += 32;
+		}
+	}
+	uint64_t consumedkeys = keybytes - (keybytes & 7); // key bytes already decoded by the SIMD path (a multiple of 8)
+	return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31); // the remaining count % 32 integers go through the scalar decoder
+}
+
+// Read count 32-bit integers in StreamVByte format from in, storing the result in out.  Returns the number of bytes read.
+size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t count) {
+	if (count == 0)
+		return 0;
+	const uint8_t *keyPtr = in;            // full list of keys is next
+	// 2 bits per key, rounded up to whole bytes; written this way because count + 3 wraps for count near UINT32_MAX
+	uint32_t keyLen = count / 4 + (count % 4 != 0);
+	const uint8_t *dataPtr = keyPtr + keyLen;  // data starts at end of keys
+	return svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in;
+}
--- a/cpp/streamvbyte/src/streamvbytedelta.c
+++ b/cpp/streamvbyte/src/streamvbytedelta.c
@@ -0,0 +1,575 @@
+#include "streamvbyte.h"
+#if defined(_MSC_VER)
+     /* Microsoft C/C++-compatible compiler */
+     #include <intrin.h>
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+     /* GCC-compatible compiler, targeting x86/x86-64 */
+     #include <x86intrin.h>
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+     /* GCC-compatible compiler, targeting ARM with NEON */
+     #include <arm_neon.h>
+#elif defined(__GNUC__) && defined(__IWMMXT__)
+     /* GCC-compatible compiler, targeting ARM with WMMX */
+     #include <mmintrin.h>
+#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
+     /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
+     #include <altivec.h>
+#elif defined(__GNUC__) && defined(__SPE__)
+     /* GCC-compatible compiler, targeting PowerPC with SPE */
+     #include <spe.h>
+#endif
+// lengthTable[key]: total number of data bytes used by the four integers of one key byte (each 2-bit code c stands for c + 1 bytes, hence 4..16).
+static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
+		10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
+		9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
+		11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
+		8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
+		12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
+		13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
+		11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
+		9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
+		10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
+		13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
+		13, 14, 15, 16 };
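+// shuffleTable[key]: byte-shuffle mask that spreads the packed data bytes of four integers into four 32-bit lanes; -1 (0xFF once stored as uint8_t) marks a padding byte that the shuffle zeroes. The digits in each row comment are the byte lengths of the four integers, first integer leftmost.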
+static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
+		-1, -1, 3, -1, -1, -1 }, // 1111
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 2111
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 3111
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 4111
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 1211
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 2211
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 3211
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 4211
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 1311
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 2311
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 3311
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 },      // 4311
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 },    // 1411
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 },     // 2411
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 },      // 3411
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 },       // 4411
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 },  // 1121
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 2121
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 3121
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 4121
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 1221
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 2221
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 3221
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 4221
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 1321
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 2321
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 3321
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 },       // 4321
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 },     // 1421
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 },      // 2421
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 },       // 3421
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 },       // 4421
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 },   // 1131
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 2131
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 3131
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 4131
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 1231
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 2231
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 3231
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 4231
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 1331
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 2331
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 3331
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 },       // 4331
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 },      // 1431
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 },       // 2431
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 },       // 3431
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 },       // 4431
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 },    // 1141
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 2141
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 3141
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 4141
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 1241
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 2241
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 3241
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 4241
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 1341
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 2341
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 3341
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 },       // 4341
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 },       // 1441
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 },       // 2441
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 },       // 3441
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 },       // 4441
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 },  // 1112
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 2112
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 3112
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 4112
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 1212
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 2212
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 3212
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 4212
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 1312
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 2312
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 3312
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 },       // 4312
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 },     // 1412
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 },      // 2412
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 },       // 3412
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 },       // 4412
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 },   // 1122
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 2122
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 3122
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 4122
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 1222
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 2222
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 3222
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 4222
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 1322
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 2322
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 3322
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 },       // 4322
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 },      // 1422
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 },       // 2422
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 },       // 3422
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 },       // 4422
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 },    // 1132
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 2132
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 3132
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 4132
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 1232
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 2232
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 3232
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 4232
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 1332
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 2332
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 3332
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 },       // 4332
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 },       // 1432
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 },       // 2432
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 },       // 3432
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 },       // 4432
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 },     // 1142
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 2142
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 3142
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 4142
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 1242
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 2242
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 3242
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 4242
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 1342
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 2342
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 3342
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 },       // 4342
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 },       // 1442
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 },       // 2442
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 },       // 3442
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 },       // 4442
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 },   // 1113
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 2113
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 3113
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 4113
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 1213
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 2213
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 3213
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 4213
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 1313
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 2313
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 3313
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 },       // 4313
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 },      // 1413
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 },       // 2413
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 },       // 3413
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 },       // 4413
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 },    // 1123
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 2123
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 3123
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 4123
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 1223
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 2223
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 3223
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 4223
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 1323
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 2323
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 3323
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 },       // 4323
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 },       // 1423
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 },       // 2423
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 },       // 3423
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 },       // 4423
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 },     // 1133
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 2133
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 3133
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 4133
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 1233
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 2233
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 3233
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 4233
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 1333
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 2333
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 3333
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 },       // 4333
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 },       // 1433
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 },       // 2433
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 },       // 3433
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 },       // 4433
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 },      // 1143
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 2143
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 3143
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 4143
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 1243
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 2243
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 3243
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 4243
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 1343
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 2343
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 3343
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 },       // 4343
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 },       // 1443
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 },       // 2443
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 },       // 3443
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 },       // 4443
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 },    // 1114
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 2114
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 3114
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 4114
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 1214
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 2214
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 3214
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 4214
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 1314
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 2314
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 3314
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 },       // 4314
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 },       // 1414
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 },       // 2414
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 },       // 3414
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 },       // 4414
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 },     // 1124
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 2124
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 3124
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 4124
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 1224
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 2224
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 3224
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 4224
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 1324
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 2324
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 3324
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 },       // 4324
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 },       // 1424
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 },       // 2424
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 },       // 3424
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 },       // 4424
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 },      // 1134
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 2134
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 3134
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 4134
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 1234
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 2234
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 3234
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 4234
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 1334
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 2334
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 3334
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 },       // 4334
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 },       // 1434
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 },       // 2434
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 },       // 3434
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 },       // 4434
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 },       // 1144
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 2144
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 3144
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 4144
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 1244
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 2244
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 3244
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 4244
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 1344
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 2344
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 3344
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 },       // 4344
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },       // 1444
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },       // 2444
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },       // 3444
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }        // 4444
+};
+// Stores val at the address *dataPtrPtr using 1 to 4 bytes, advances *dataPtrPtr by that many bytes and returns the 2-bit length code (byte count - 1).
+static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t *dataPtr = *dataPtrPtr;
+	uint8_t code;
+
+	if (val < (1 << 8)) { // 1 byte
+		*dataPtr = (uint8_t)(val);
+		*dataPtrPtr += 1;
+		code = 0;
+	} else if (val < (1 << 16)) { // 2 bytes
+		*(uint16_t *) dataPtr = (uint16_t)(val); // NOTE(review): store through a cast pointer, host byte order, address may be unaligned - confirm this is acceptable on every target
+		*dataPtrPtr += 2;
+		code = 1;
+	} else if (val < (1 << 24)) { // 3 bytes
+		*(uint16_t *) dataPtr = (uint16_t)(val); // low 16 bits first (same cast-pointer caveat as the 2-byte case)
+		*(dataPtr + 2) = (uint8_t)(val >> 16); // then bits 16..23 as the third byte
+		*dataPtrPtr += 3;
+		code = 2;
+	} else { // 4 bytes
+		*(uint32_t *) dataPtr = val; // NOTE(review): 32-bit store through a cast pointer, host byte order
+		*dataPtrPtr += 4;
+		code = 3;
+	}
+
+	return code;
+}
+// Delta-encodes count integers: each in[c] minus its predecessor (prev for the first) is written with _encode_data, and the 2-bit codes are packed four per key byte at keyPtr. Returns the pointer just past the last data byte written.
+static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in,
+		uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
+		uint32_t count, uint32_t prev) {
+	if (count == 0)
+		return dataPtr; // exit immediately if no data
+
+	uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
+	uint8_t key = 0; // key byte being assembled, first integer in the lowest two bits
+	for (uint32_t c = 0; c < count; c++) {
+		if (shift == 8) { // four codes collected: flush the key byte and start a new one
+			shift = 0;
+			*keyPtr++ = key;
+			key = 0;
+		}
+		uint32_t val = in[c] - prev; // unsigned subtraction, wraps modulo 2^32
+		prev = in[c];
+		uint8_t code = _encode_data(val, &dataPtr);
+		key |= code << shift;
+		shift += 2;
+	}
+
+	*keyPtr = key;  // write last key (no increment needed)
+	return dataPtr; // pointer to first unused data byte
+}
+// Delta-encodes count integers from in into out (all key bytes first, then the data bytes); prev is the value subtracted from in[0]. Returns the number of bytes written to out.
+size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out,
+		uint32_t prev) {
+	uint8_t *keyPtr = out;         // keys are written at the very start of out (this function stores no count header)
+	uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
+	uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
+
+	return svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, prev) - out; // end-of-data pointer minus out == bytes written
+
+}
+// Decodes the four integers described by one key byte: returns them as four 32-bit lanes and advances *dataPtrPtr by the number of data bytes they occupy.
+static inline __m128i _decode_avx(uint32_t key, const uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t len = lengthTable[key]; // data bytes consumed by the four integers of this key
+	__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr); // always reads 16 bytes, even when len < 16
+	__m128i Shuf = _mm_loadu_si128((const __m128i *) shuffleTable[key]); // unaligned load: a uint8_t table carries no 16-byte alignment guarantee
+
+	Data = _mm_shuffle_epi8(Data, Shuf); // move each integer's bytes into its own 32-bit lane, zero-filling the rest
+	*dataPtrPtr += len;
+
+	return Data;
+}
+#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element
+
+
+// Stores the 16 bytes of Vec (four 32-bit integers) at out; the store is unaligned, so out needs no particular alignment.
+static inline void _write_avx(uint32_t *out, __m128i Vec) {
+	_mm_storeu_si128((__m128i *) out, Vec);
+}
+// Turns four deltas [A B C D] in Vec into running sums on top of the last lane of Prev, stores them at out[0..3] and returns them (callers pass the result back in as the next Prev).
+static __m128i _write_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
+	__m128i Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done)
+	Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P]
+	Vec = _mm_add_epi32(Vec, Add);                    // Cycle 2: [A AB BC CD]
+	Add = _mm_slli_si128(Vec, 8);                     // Cycle 3: [- - A AB]
+	Vec = _mm_add_epi32(Vec, Prev);                 // Cycle 3: [PA PAB PBC PCD]
+	Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD]
+
+	_write_avx(out, Vec);
+	return Vec; // last lane holds the most recent decoded value
+}
+// Shuffle mask used by _write_16bit_avx_d1: copies the upper four 16-bit lanes (bytes 8..15) into four 32-bit lanes with zeroed high halves. Both branches spell the same 16 mask bytes; presumably two spellings exist because MSVC and GCC-style compilers initialise __m128i differently.
+#ifndef _MSC_VER
+static __m128i High16To32 = {0xFFFF0B0AFFFF0908, 0xFFFF0F0EFFFF0D0C};
+#else
+static __m128i High16To32 = {8,  9,  -1, -1, 10, 11, -1, -1,
+                           12, 13, -1, -1, 14, 15, -1, -1};
+#endif
+// 16-bit variant of _write_avx_d1 for eight deltas held as 16-bit lanes: stores eight running sums (on top of the last lane of Prev) at out[0..7] and returns the last four. NOTE(review): the 32-bit adds on 16-bit lanes assume no partial sum overflows 16 bits - confirm callers only pass zero-extended 8-bit deltas.
+static inline __m128i _write_16bit_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
+  // vec == [A B C D E F G H] (16 bit values)
+  __m128i Add = _mm_slli_si128(Vec, 2);               // [- A B C D E F G]
+  Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit)
+  Vec = _mm_add_epi32(Vec, Add);                    // [A AB BC CD DE EF FG GH]
+  Add = _mm_slli_si128(Vec, 4);                     // [- - A AB BC CD DE EF]
+  Vec = _mm_add_epi32(Vec, Add);      // [A AB ABC ABCD BCDE CDEF DEFG EFGH]
+  __m128i V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit)
+  V1 = _mm_add_epi32(V1, Prev);       // [PA PAB PABC PABCD] (32-bit)
+  __m128i V2 =
+      _mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit)
+  V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit)
+  _write_avx(out, V1); // first four results
+  _write_avx(out + 4, V2); // last four results
+  return V2;
+}
+// Reads one integer of code + 1 bytes (code is 0..3) from *dataPtrPtr, advances *dataPtrPtr past it and returns the value.
+static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
+	const uint8_t *dataPtr = *dataPtrPtr;
+	uint32_t val;
+
+	if (code == 0) { // 1 byte
+		val = (uint32_t) * dataPtr;
+		dataPtr += 1;
+	} else if (code == 1) { // 2 bytes
+		val = (uint32_t) * (uint16_t *) dataPtr; // NOTE(review): load through a cast pointer, host byte order, address may be unaligned
+		dataPtr += 2;
+	} else if (code == 2) { // 3 bytes
+		val = (uint32_t) * (uint16_t *) dataPtr; // low 16 bits (same cast-pointer caveat as the 2-byte case)
+		val |= *(dataPtr + 2) << 16; // third byte supplies bits 16..23
+		dataPtr += 3;
+	} else {                      // code == 3
+		val = *(uint32_t *) dataPtr; // 4 bytes
+		dataPtr += 4;
+	}
+
+	*dataPtrPtr = dataPtr;
+	return val;
+}
+// Scalar delta decoder: reads count integers (2-bit codes from keyPtr, bytes from dataPtr), adds each to the running value that starts at prev, and writes the sums to outPtr. Returns the pointer just past the last data byte read.
+const uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr,
+		const uint8_t *dataPtr, uint32_t count,
+                                   uint32_t prev) {
+  if (count == 0)
+    return dataPtr; // no reads or writes if no data
+
+  uint8_t shift = 0; // bit position of the current 2-bit code inside key
+  uint32_t key = *keyPtr++;
+
+  for (uint32_t c = 0; c < count; c++) {
+    if (shift == 8) { // all four codes of this key byte used: fetch the next key byte
+      shift = 0;
+      key = *keyPtr++;
+    }
+    uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3);
+    val += prev; // undo the delta: value = delta + previous value
+    *outPtr++ = val;
+    prev = val;
+    shift += 2;
+  }
+
+  return dataPtr; // pointer to first unused byte after end
+}
+// Decodes count delta-coded integers into out: full groups of 32 integers (8 key bytes) take the SSE path, the remaining count & 31 go through svb_decode_scalar_d1_init. prev seeds the running sum. Returns the pointer just past the last data byte consumed.
+const uint8_t *svb_decode_avx_d1_init(uint32_t *out, const uint8_t *__restrict__ keyPtr,
+		const uint8_t *__restrict__ dataPtr, uint64_t count, uint32_t prev) {
+	uint64_t keybytes = count / 4; // number of key bytes
+	if (keybytes >= 8) { // SIMD path needs at least one full group of 8 key bytes
+		__m128i Prev = _mm_set1_epi32(prev);
+		__m128i Data;
+
+		int64_t Offset = -(int64_t) keybytes / 8 + 1; // minus (number of full 8-byte key groups - 1); counts up to 0 in the loop
+
+		const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset; // points at the last full key group, so keyPtr64[Offset] is the first one
+		uint64_t nextkeys = keyPtr64[Offset];
+		for (; Offset != 0; ++Offset) { // every full group except the last one
+			uint64_t keys = nextkeys;
+			nextkeys = keyPtr64[Offset + 1]; // fetch the following group's keys ahead of use
+			// faster 16-bit delta since we only have 8-bit values
+			if (!keys) { // 32 1-byte ints in a row
+
+				Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
+				Prev = _write_16bit_avx_d1(out, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
+				Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
+				Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 24))); // 16-byte load of which only the low 8 bytes are converted, so it reads 8 bytes beyond this group's data
+				Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
+				out += 32;
+				dataPtr += 32;
+				continue;
+			}
+
+			Data = _decode_avx(keys & 0x00FF, &dataPtr); // general path: one key byte (four integers) per step
+			Prev = _write_avx_d1(out, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 4, Data, Prev);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0x00FF), &dataPtr);
+			Prev = _write_avx_d1(out + 8, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 12, Data, Prev);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0x00FF), &dataPtr);
+			Prev = _write_avx_d1(out + 16, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 20, Data, Prev);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0x00FF), &dataPtr);
+			Prev = _write_avx_d1(out + 24, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 28, Data, Prev);
+
+			out += 32;
+		}
+		{ // last full group: same work as the loop body, but without fetching a following key word
+			uint64_t keys = nextkeys;
+			// faster 16-bit delta since we only have 8-bit values
+			if (!keys) { // 32 1-byte ints in a row
+				Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
+				Prev = _write_16bit_avx_d1(out, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
+				Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
+				Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_loadl_epi64((__m128i *) (dataPtr + 24))); // 8-byte load here - presumably to avoid reading past this group's 32 data bytes
+				Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
+				out += 32;
+				dataPtr += 32;
+
+			} else {
+
+				Data = _decode_avx(keys & 0x00FF, &dataPtr);
+				Prev = _write_avx_d1(out, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 4, Data, Prev);
+
+				keys >>= 16;
+				Data = _decode_avx((keys & 0x00FF), &dataPtr);
+				Prev = _write_avx_d1(out + 8, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 12, Data, Prev);
+
+				keys >>= 16;
+				Data = _decode_avx((keys & 0x00FF), &dataPtr);
+				Prev = _write_avx_d1(out + 16, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 20, Data, Prev);
+
+				keys >>= 16;
+				Data = _decode_avx((keys & 0x00FF), &dataPtr);
+				Prev = _write_avx_d1(out + 24, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 28, Data, Prev);
+
+				out += 32;
+			}
+		}
+		prev = out[-1]; // last value written by the SIMD path seeds the scalar tail
+	}
+	uint64_t consumedkeys = keybytes - (keybytes & 7); // key bytes handled above: a multiple of 8, or 0 when the SIMD path was skipped
+	return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr,
+			count & 31, prev);
+}
+// Decodes count delta-coded integers from in into out; prev is the value added to the first delta. Returns the number of bytes read from in.
+size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out,
+		uint32_t count, uint32_t prev) {
+	uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
+	const uint8_t *keyPtr = in; // key bytes sit at the very start of the input
+	const uint8_t *dataPtr = keyPtr + keyLen;  // data starts at end of keys
+	return svb_decode_avx_d1_init(out, keyPtr, dataPtr, count, prev) - in; // end-of-data pointer minus in == bytes read
+}
--- a/cpp/streamvbyte/tests/unit.c
+++ b/cpp/streamvbyte/tests/unit.c
@@ -0,0 +1,73 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "streamvbyte.h"
+#include "streamvbytedelta.h"
+// Round-trip test: for many array lengths, encodes then decodes with the plain and the delta codec and checks that byte counts and contents match. Returns 0 on success, -1 on the first failure.
+int main() {
+	int N = 4096;
+	uint32_t * datain = malloc(N * sizeof(uint32_t));
+	uint8_t * compressedbuffer = malloc(2 * N * sizeof(uint32_t)); // twice the raw input size
+	uint32_t * recovdata = malloc(N * sizeof(uint32_t));
+	if (datain == NULL || compressedbuffer == NULL || recovdata == NULL) {
+		printf("memory allocation failed\n"); // bail out instead of dereferencing NULL below
+		return -1;
+	}
+
+	for (int length = 0; length <= N;) {
+		printf("length = %d \n", length);
+		for (uint32_t gap = 1; gap <= 387420489; gap *= 3) { // powers of 3 up to 3^18: covers 1- to 4-byte values
+			for (int k = 0; k < length; ++k)
+				datain[k] = gap;
+			size_t compsize = streamvbyte_encode(datain, length, compressedbuffer);
+			size_t usedbytes = streamvbyte_decode(compressedbuffer, recovdata, length);
+			if (compsize != usedbytes) {
+				printf(
+						"[streamvbyte_decode] code is buggy gap = %d, size mismatch %d %d \n",
+						(int) gap, (int) compsize, (int) usedbytes);
+				return -1;
+			}
+			for (int k = 0; k < length; ++k) {
+				if (recovdata[k] != datain[k]) {
+					printf("[streamvbyte_decode] code is buggy gap = %d\n",
+							(int) gap);
+					return -1;
+				}
+			}
+		}
+
+		printf("Delta \n");
+		for (size_t gap = 1; gap <= 531441; gap *= 3) {
+			for (int k = 0; k < length; ++k)
+				datain[k] = gap * k; // increasing sequence with a constant difference of gap
+			size_t compsize = streamvbyte_delta_encode(datain, length, compressedbuffer, 0);
+			size_t usedbytes = streamvbyte_delta_decode(compressedbuffer, recovdata, length, 0);
+			if (compsize != usedbytes) {
+				printf(
+						"[streamvbyte_delta_decode] code is buggy gap = %d, size mismatch %d %d \n",
+						(int) gap, (int) compsize, (int) usedbytes);
+				return -1;
+			}
+			for (int k = 0; k < length; ++k) {
+				if (recovdata[k] != datain[k]) {
+					printf(
+							"[streamvbyte_delta_decode] code is buggy gap = %d\n",
+							(int) gap);
+					return -1;
+				}
+			}
+
+		}
+
+		if (length < 128) // every length up to 128, then doubling up to N
+			++length;
+		else {
+			length *= 2;
+		}
+	}
+	free(datain);
+	free(compressedbuffer);
+	free(recovdata);
+	printf("Code looks good.\n");
+	return 0;
+}
--- a/script/build-doc.sh
+++ b/script/build-doc.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+DEST=target/doc/tantivy/docs/
+mkdir -p $DEST
+
+for f in $(ls docs/*.md)
+do
+    rustdoc $f -o $DEST --markdown-css ../../rustdoc.css --markdown-css style.css
+done
+
+cp docs/*.css $DEST
--- a/script/profile.sh
+++ b/script/profile.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Profiles the release build of tantivy-bench under valgrind: cachegrind, then callgrind.
+valgrind --tool=cachegrind target/release/tantivy-bench -i /data/wiki-index -q ./queries.txt -n 3
+valgrind --tool=callgrind target/release/tantivy-bench -i /data/wiki-index -q ./queries.txt -n 3
+
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -105,7 +105,6 @@ pub mod tests {
        offset: DocId,
        segment_max_doc: DocId,
        docs: Vec<DocId>,
-        scores: Vec<Score>,
    }

    impl TestCollector {
@@ -113,19 +112,14 @@ pub mod tests {
        pub fn docs(self) -> Vec<DocId> {
            self.docs
        }
-
-        pub fn scores(self) -> Vec<Score> {
-            self.scores
-        }
    }

    impl Default for TestCollector {
        fn default() -> TestCollector {
            TestCollector {
+                docs: Vec::new(),
                offset: 0,
                segment_max_doc: 0,
-                docs: Vec::new(),
-                scores: Vec::new(),
            }
        }
    }
@@ -137,13 +131,12 @@ pub mod tests {
            Ok(())
        }

-        fn collect(&mut self, doc: DocId, score: Score) {
+        fn collect(&mut self, doc: DocId, _score: Score) {
            self.docs.push(doc + self.offset);
-            self.scores.push(score);
        }

        fn requires_scoring(&self) -> bool {
-            true
+            false
        }
    }

--- a/src/common/bitpacker.rs
+++ b/src/common/bitpacker.rs
@@ -3,7 +3,6 @@ use std::io;
 use common::serialize::BinarySerializable;
 use std::mem;
 use std::ops::Deref;
-use std::ptr;

 pub(crate) struct BitPacker {
    mini_buffer: u64,
@@ -106,18 +105,18 @@ where
                addr + 8 <= data.len(),
                "The fast field field should have been padded with 7 bytes."
            );
-            let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
+            let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
            let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
            val_shifted & mask
        } else {
            let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
-                unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
+                unsafe { *(data[addr..].as_ptr() as *const u64) }
            } else {
                let mut buffer = [0u8; 8];
                for i in addr..data.len() {
                    buffer[i - addr] += data[i];
                }
-                unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
+                unsafe { *(buffer[..].as_ptr() as *const u64) }
            };
            let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
            val_shifted & mask
@@ -141,7 +140,7 @@ where
            for output_val in output.iter_mut() {
                let addr = addr_in_bits >> 3;
                let bit_shift = addr_in_bits & 7;
-                let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
+                let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
                let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
                *output_val = val_shifted & mask;
                addr_in_bits += num_bits;
--- a/src/common/bitset.rs
+++ b/src/common/bitset.rs
@@ -1,5 +1,4 @@
 use std::fmt;
-use std::u64;

 #[derive(Clone, Copy, Eq, PartialEq)]
 pub(crate) struct TinySet(u64);
@@ -85,12 +84,23 @@ impl TinySet {
    /// and removes it.
    #[inline(always)]
    pub fn pop_lowest(&mut self) -> Option<u32> {
+        if let Some(lowest) = self.lowest() {
+            self.0 ^= TinySet::singleton(lowest).0;
+            Some(lowest)
+        } else {
+            None
+        }
+    }
+
+    /// Returns the lowest element in the `TinySet`
+    /// (or None if the set is empty).
+    #[inline(always)]
+    pub fn lowest(&mut self) -> Option<u32> {
        if self.is_empty() {
            None
        } else {
-            let lowest = self.0.trailing_zeros() as u32;
-            self.0 ^= TinySet::singleton(lowest).0;
-            Some(lowest)
+            let least_significant_bit = self.0.trailing_zeros() as u32;
+            Some(least_significant_bit)
        }
    }

@@ -356,15 +366,7 @@ mod tests {

    #[bench]
    fn bench_tinyset_pop(b: &mut test::Bencher) {
-        b.iter(|| {
-            let mut tinyset = TinySet::singleton(test::black_box(31u32));
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-        });
+        b.iter(|| test::black_box(TinySet::singleton(31u32)).pop_lowest());
    }

    #[bench]
@@ -385,5 +387,4 @@ mod tests {
    fn bench_bitset_initialize(b: &mut test::Bencher) {
        b.iter(|| BitSet::with_max_value(1_000_000));
    }
-
 }
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -1,4 +1,5 @@
-    mod serialize;
+mod serialize;
+mod timer;
 mod vint;
 mod counting_writer;
 mod composite_file;
@@ -7,6 +8,9 @@ mod bitset;

 pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
 pub use self::serialize::{BinarySerializable, FixedSize};
+pub use self::timer::Timing;
+pub use self::timer::TimerTree;
+pub use self::timer::OpenTimer;
 pub use self::vint::VInt;
 pub use self::counting_writer::CountingWriter;
 pub use self::bitset::BitSet;
--- a/src/common/timer.rs
+++ b/src/common/timer.rs
@@ -0,0 +1,99 @@
+use time::PreciseTime;
+
+pub struct OpenTimer<'a> { // a running timer; its timing is recorded when it is dropped
+    name: &'static str, // label stored with the recorded timing
+    timer_tree: &'a mut TimerTree, // tree that receives the timing on drop
+    start: PreciseTime, // instant at which this timer was opened
+    depth: u32, // nesting level: 0 for a root timer, parent's depth + 1 for a subtask
+}
+
+impl<'a> OpenTimer<'a> {
+    /// Starts timing a new named subtask nested under this timer.
+    ///
+    /// The subtask is recorded one level deeper than `self`, and its timing
+    /// is pushed to the timer tree when the returned `OpenTimer` is dropped.
+    pub fn open(&mut self, name: &'static str) -> OpenTimer {
+        OpenTimer {
+            name,
+            timer_tree: self.timer_tree,
+            start: PreciseTime::now(),
+            depth: self.depth + 1,
+        }
+    }
+}
+
+impl<'a> Drop for OpenTimer<'a> {
+    fn drop(&mut self) { // stops the timer and appends its `Timing` to the tree
+        self.timer_tree.timings.push(Timing {
+            name: self.name,
+            duration: self.start
+                .to(PreciseTime::now())
+                .num_microseconds() // elapsed time since the timer was opened
+                .unwrap(), // NOTE(review): a `None` here would panic inside `drop`
+            depth: self.depth,
+        });
+    }
+}
+
+/// A single recorded timing, created when an `OpenTimer` is dropped.
+#[derive(Debug, Serialize)]
+pub struct Timing {
+    name: &'static str, // name given when the timer was opened
+    duration: i64, // elapsed time in microseconds
+    depth: u32, // nesting level of the timer (0 for a root timer)
+}
+
+/// Flat record of the timings of nested named tasks.
+#[derive(Debug, Serialize)]
+pub struct TimerTree {
+    timings: Vec<Timing>, // appended when a timer is dropped, so subtasks precede their parent
+}
+
+impl TimerTree {
+    /// Returns the duration (in microseconds) of the last closed timer, or 0 if none was recorded.
+    pub fn total_time(&self) -> i64 {
+        self.timings.last().map_or(0, |timing| timing.duration)
+    }
+
+    /// Opens a new top-level (depth 0) named task.
+    pub fn open(&mut self, name: &'static str) -> OpenTimer {
+        OpenTimer {
+            name,
+            timer_tree: self,
+            start: PreciseTime::now(),
+            depth: 0,
+        }
+    }
+}
+
+impl Default for TimerTree {
+    fn default() -> TimerTree { // an empty tree: no timing recorded yet
+        TimerTree {
+            timings: Vec::new(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_timer() { // each timer must record exactly one timing when it goes out of scope
+        let mut timer_tree = TimerTree::default();
+        {
+            let mut a = timer_tree.open("a"); // root task
+            {
+                let mut ab = a.open("b"); // nested under "a"
+                {
+                    let _abc = ab.open("c"); // first child of "b"
+                }
+                {
+                    let _abd = ab.open("d"); // second child of "b", opened after "c" closed
+                }
+            }
+        }
+        assert_eq!(timer_tree.timings.len(), 4); // one entry each for c, d, b and a
+    }
+}
--- a/src/compression/mod.rs
+++ b/src/compression/mod.rs
@@ -3,97 +3,41 @@

 mod stream;

-pub const COMPRESSION_BLOCK_SIZE: usize = 128;
-const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
-
 pub use self::stream::CompressedIntStream;

-
-use bitpacking::{BitPacker, BitPacker4x};
-
+pub const COMPRESSION_BLOCK_SIZE: usize = 128;

 /// Returns the size in bytes of a compressed block, given `num_bits`.
 pub fn compressed_block_size(num_bits: u8) -> usize {
-    1 + (num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
+    1 + (num_bits as usize) * 16
 }

-pub struct BlockEncoder {
-    bitpacker: BitPacker4x,
-    pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
-    pub output_len: usize,
+#[cfg(not(feature = "simdcompression"))]
+mod pack {
+    mod compression_pack_nosimd;
+    pub use self::compression_pack_nosimd::{BlockDecoder, BlockEncoder};
 }

-impl BlockEncoder {
-    pub fn new() -> BlockEncoder {
-        BlockEncoder {
-            bitpacker: BitPacker4x::new(),
-            output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
-            output_len: 0,
-        }
-    }
-
-    pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
-        let num_bits = self.bitpacker.num_bits_sorted(offset, block);
-        self.output[0] = num_bits;
-        let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits);
-        &self.output[..written_size]
-    }
-
-    pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
-        let num_bits = self.bitpacker.num_bits(block);
-        self.output[0] = num_bits;
-        let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
-        &self.output[..written_size]
-    }
+#[cfg(feature = "simdcompression")]
+mod pack {
+    mod compression_pack_simd;
+    pub use self::compression_pack_simd::{BlockDecoder, BlockEncoder};
 }

+pub use self::pack::{BlockDecoder, BlockEncoder};

-pub struct BlockDecoder {
-    bitpacker: BitPacker4x,
-    pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
-    pub output_len: usize,
+#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
+mod vint {
+    mod compression_vint_nosimd;
+    pub(crate) use self::compression_vint_nosimd::*;
 }

-impl BlockDecoder {
-    pub fn new() -> BlockDecoder {
-        BlockDecoder::with_val(0u32)
-    }
-
-    pub fn with_val(val: u32) -> BlockDecoder {
-        let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
-        output[COMPRESSION_BLOCK_SIZE] = 0u32;
-        BlockDecoder {
-            bitpacker: BitPacker4x::new(),
-            output,
-            output_len: 0,
-        }
-    }
-    
-    pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
-        let num_bits = compressed_data[0];
-        self.output_len = COMPRESSION_BLOCK_SIZE;
-        1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
-    }
-
-    pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
-        let num_bits = compressed_data[0];
-        self.output_len = COMPRESSION_BLOCK_SIZE;
-        1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
-    }
-
-    #[inline]
-    pub fn output_array(&self) -> &[u32] {
-        &self.output[..self.output_len]
-    }
-
-    #[inline]
-    pub fn output(&self, idx: usize) -> u32 {
-        self.output[idx]
-    }
+#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
+mod vint {
+    mod compression_vint_simd;
+    pub(crate) use self::compression_vint_simd::*;
 }

-mod vint;
-
 pub trait VIntEncoder {
    /// Compresses an array of `u32` integers,
    /// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
--- a/src/compression/pack/compression_pack_nosimd.rs
+++ b/src/compression/pack/compression_pack_nosimd.rs
@@ -0,0 +1,148 @@
+use common::compute_num_bits;
+use common::bitpacker::{BitPacker, BitUnpacker};
+use common::CountingWriter;
+use std::cmp;
+use std::io::Write;
+use super::super::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
+
+const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
+
+pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize {
+    let mut max_delta = 0; // largest delta in the block; it sets the bit width
+    { // pass 1: replace each value by its gap to the previous one (in place)
+        let mut local_offset = offset; // the first value is taken relative to `offset`
+        for i in 0..COMPRESSION_BLOCK_SIZE {
+            let val = vals[i];
+            let delta = val - local_offset; // assumes non-decreasing input — TODO confirm
+            max_delta = cmp::max(max_delta, delta);
+            vals[i] = delta;
+            local_offset = val;
+        }
+    }
+    let mut counting_writer = CountingWriter::wrap(output);
+    let num_bits = compute_num_bits(max_delta as u64);
+    counting_writer.write_all(&[num_bits]).unwrap(); // header byte: the bit width
+    // pass 2: write every delta using `num_bits` bits
+    let mut bit_packer = BitPacker::new();
+    for val in vals {
+        bit_packer
+            .write(*val as u64, num_bits, &mut counting_writer)
+            .unwrap();
+    } // NOTE(review): no `flush` here, unlike `compress_block_unsorted`
+    let compressed_size = counting_writer.written_bytes();
+    assert_eq!(compressed_size, compressed_block_size(num_bits)); // must match the nominal size
+    compressed_size
+}
+
+pub struct BlockEncoder { // compresses one block of integers at a time into `output`
+    pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE], // receives the compressed bytes
+    pub output_len: usize, // NOTE(review): set to 0 in `new`, not updated by the methods below
+    input_buffer: [u32; COMPRESSION_BLOCK_SIZE], // scratch copy, delta-encoded in place
+}
+
+impl BlockEncoder {
+    pub fn new() -> BlockEncoder { // all buffers start zeroed
+        BlockEncoder {
+            output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
+            output_len: 0,
+            input_buffer: [0u32; COMPRESSION_BLOCK_SIZE],
+        }
+    }
+
+    pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
+        self.input_buffer.clone_from_slice(vals); // panics unless `vals` is exactly one block
+        let compressed_size = compress_sorted(&mut self.input_buffer, &mut self.output, offset);
+        &self.output[..compressed_size] // only the bytes written for this block
+    }
+
+    pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
+        let compressed_size = {
+            let output: &mut [u8] = &mut self.output;
+            let max = vals.iter()
+                .cloned()
+                .max()
+                .expect("compress unsorted called with an empty array");
+            let num_bits = compute_num_bits(max as u64); // derived from the largest value
+            let mut counting_writer = CountingWriter::wrap(output);
+            counting_writer.write_all(&[num_bits]).unwrap(); // header byte: the bit width
+            let mut bit_packer = BitPacker::new();
+            for val in vals {
+                bit_packer
+                    .write(*val as u64, num_bits, &mut counting_writer)
+                    .unwrap();
+            }
+            for _ in vals.len()..COMPRESSION_BLOCK_SIZE { // pad short blocks with vals[0]
+                bit_packer
+                    .write(vals[0] as u64, num_bits, &mut counting_writer)
+                    .unwrap();
+            }
+            bit_packer.flush(&mut counting_writer).expect(
+                "Flushing the bitpacking \
+                 in an in RAM buffer should never fail",
+            );
+            // we avoid writing "closing", because we
+            // do not want 7 bytes of padding here.
+            counting_writer.written_bytes()
+        };
+        &self.output[..compressed_size] // only the bytes written for this block
+    }
+}
+
+pub struct BlockDecoder { // decodes one compressed block at a time into `output`
+    pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE], // oversized: a decoded block fills 128 slots
+    pub output_len: usize, // number of decoded values available in `output`
+}
+
+impl BlockDecoder {
+    /// Creates a decoder whose output buffer is zero-filled.
+    pub fn new() -> BlockDecoder {
+        BlockDecoder::with_val(0u32)
+    }
+
+    /// Creates a decoder whose output buffer is filled with `val`.
+    pub fn with_val(val: u32) -> BlockDecoder {
+        BlockDecoder {
+            output: [val; COMPRESSED_BLOCK_MAX_SIZE],
+            output_len: 0,
+        }
+    }
+
+    /// Decodes a delta-encoded block on top of `offset`; returns the compressed block size.
+    pub fn uncompress_block_sorted<'a>(
+        &mut self,
+        compressed_data: &'a [u8],
+        mut offset: u32,
+    ) -> usize {
+        let num_bits = compressed_data[0]; // header byte: bit width of every delta
+        let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
+        for i in 0..COMPRESSION_BLOCK_SIZE {
+            let delta = bit_unpacker.get(i);
+            let val = offset + delta as u32; // running sum rebuilds the original value
+            self.output[i] = val;
+            offset = val;
+        }
+        self.output_len = COMPRESSION_BLOCK_SIZE;
+        compressed_block_size(num_bits)
+    }
+
+    /// Decodes a block of bit-packed values; returns the compressed block size in bytes.
+    pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
+        let num_bits = compressed_data[0]; // header byte: bit width of every value
+        let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
+        for i in 0..COMPRESSION_BLOCK_SIZE {
+            self.output[i] = bit_unpacker.get(i) as u32;
+        }
+        self.output_len = COMPRESSION_BLOCK_SIZE;
+        compressed_block_size(num_bits) // same size helper as the sorted decoder
+    }
+
+    #[inline]
+    pub fn output_array(&self) -> &[u32] {
+        &self.output[..self.output_len] // the values decoded so far
+    }
+
+    #[inline]
+    pub fn output(&self, idx: usize) -> u32 {
+        self.output[idx] // indexes the whole buffer, not just the first `output_len` values
+    }
+}
--- a/src/compression/pack/compression_pack_simd.rs
+++ b/src/compression/pack/compression_pack_simd.rs
@@ -0,0 +1,116 @@
+use compression::COMPRESSION_BLOCK_SIZE;
+
+const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
+
+mod simdcomp { // raw bindings to the C block compression routines
+    use libc::size_t;
+
+    extern "C" { // NOTE(review): no lengths are passed; buffer sizes are implied by the C side
+        pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
+
+        pub fn uncompress_sorted(
+            compressed_data: *const u8,
+            output: *mut u32,
+            offset: u32,
+        ) -> size_t;
+
+        pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
+
+        pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t;
+    }
+}
+
+// Thin wrappers over the C routines. NOTE(review): only raw pointers are passed, so the
+// slices presumably have to span a full block on both sides — confirm against the C code.
+fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
+    unsafe { simdcomp::compress_sorted(vals.as_ptr(), output.as_mut_ptr(), offset) }
+}
+fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
+    unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
+}
+
+fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {
+    unsafe { simdcomp::compress_unsorted(vals.as_ptr(), output.as_mut_ptr()) }
+}
+fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize {
+    unsafe { simdcomp::uncompress_unsorted(compressed_data.as_ptr(), output.as_mut_ptr()) }
+}
+
+pub struct BlockEncoder { // compresses one block at a time through the C routines
+    pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE], // receives the compressed bytes
+    pub output_len: usize, // NOTE(review): set to 0 in `new`, not updated by the methods below
+}
+
+impl BlockEncoder {
+    pub fn new() -> BlockEncoder { // zeroed output buffer
+        BlockEncoder {
+            output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
+            output_len: 0,
+        }
+    }
+
+    pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
+        let compressed_size = compress_sorted(vals, &mut self.output, offset);
+        &self.output[..compressed_size] // the bytes produced for this block
+    }
+
+    pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
+        let compressed_size = compress_unsorted(vals, &mut self.output);
+        &self.output[..compressed_size] // the bytes produced for this block
+    }
+}
+
+pub struct BlockDecoder { // decodes one compressed block at a time through the C routines
+    pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE], // receives the decoded integers
+    pub output_len: usize, // number of decoded values exposed by `output_array`
+}
+
+impl BlockDecoder {
+    pub fn new() -> BlockDecoder { // decoder with a zero-filled output buffer
+        BlockDecoder::with_val(0u32)
+    }
+
+    pub fn with_val(val: u32) -> BlockDecoder { // decoder whose buffer is pre-filled with `val`
+        BlockDecoder {
+            output: [val; COMPRESSED_BLOCK_MAX_SIZE],
+            output_len: 0,
+        }
+    }
+
+    pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
+        let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
+        self.output_len = COMPRESSION_BLOCK_SIZE; // a full block is assumed to have been decoded
+        consumed_size // value returned by the C routine
+    }
+
+    pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
+        let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
+        self.output_len = COMPRESSION_BLOCK_SIZE; // a full block is assumed to have been decoded
+        consumed_size // value returned by the C routine
+    }
+
+    #[inline]
+    pub fn output_array(&self) -> &[u32] {
+        &self.output[..self.output_len] // the values decoded so far
+    }
+
+    #[inline]
+    pub fn output(&self, idx: usize) -> u32 {
+        self.output[idx] // indexes the whole buffer, not just the first `output_len` values
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::BlockEncoder;
+
+    #[test]
+    fn test_all_docs_compression_len() { // consecutive doc ids must compress to 1 bit each
+        let data: Vec<u32> = (0u32..128u32).collect(); // one full block: 0, 1, ..., 127
+        let mut encoder = BlockEncoder::new();
+        let compressed = encoder.compress_block_sorted(&data, 0u32);
+        assert_eq!(compressed.len(), 17); // 1 header byte + 128 deltas * 1 bit = 16 bytes
+    }
+
+}
--- a/src/compression/stream.rs
+++ b/src/compression/stream.rs
@@ -11,12 +11,7 @@ use directory::{ReadOnlySource, SourceRead};
 /// decompressing blocks that are not required.
 pub struct CompressedIntStream {
    buffer: SourceRead,
-
    block_decoder: BlockDecoder,
-    cached_addr: usize, // address of the currently decoded block
-    cached_next_addr: usize, // address following the currently decoded block
-
-    addr: usize, // address of the block associated to the current position
    inner_offset: usize,
 }

@@ -26,47 +21,34 @@ impl CompressedIntStream {
        CompressedIntStream {
            buffer: SourceRead::from(source),
            block_decoder: BlockDecoder::new(),
-            cached_addr: usize::max_value(),
-            cached_next_addr: usize::max_value(),
-
-            addr: 0,
-            inner_offset: 0,
+            inner_offset: COMPRESSION_BLOCK_SIZE,
        }
    }

-    /// Loads the block at the given address and return the address of the
-    /// following block
-    pub fn read_block(&mut self, addr: usize) -> usize {
-        if self.cached_addr == addr {
-            // we are already on this block.
-            // no need to read.
-            self.cached_next_addr
-        } else {
-            let next_addr = addr + self.block_decoder.uncompress_block_unsorted(self.buffer.slice_from(addr));
-            self.cached_addr = addr;
-            self.cached_next_addr = next_addr;
-            next_addr
-        }
-    }
-
-    /// Fills a buffer with the next `output.len()` integers.
-    /// This does not consume / advance the stream.
+    /// Fills a buffer with the next `output.len()` integers,
+    /// and advance the stream by that many els.
    pub fn read(&mut self, output: &mut [u32]) {
-        let mut cursor = self.addr;
-        let mut inner_offset = self.inner_offset;
        let mut num_els: usize = output.len();
-        let mut start = 0;
+        let mut start: usize = 0;
        loop {
-            cursor = self.read_block(cursor);
-            let block = &self.block_decoder.output_array()[inner_offset..];
-            let block_len = block.len();
-            if num_els >= block_len {
-                output[start..start + block_len].clone_from_slice(&block);
-                start += block_len;
-                num_els -= block_len;
-                inner_offset = 0;
+            let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
+            if num_els >= available {
+                if available > 0 {
+                    let uncompressed_block =
+                        &self.block_decoder.output_array()[self.inner_offset..];
+                    output[start..][..available].clone_from_slice(uncompressed_block);
+                }
+                num_els -= available;
+                start += available;
+                let num_consumed_bytes = self.block_decoder
+                    .uncompress_block_unsorted(self.buffer.as_ref());
+                self.buffer.advance(num_consumed_bytes);
+                self.inner_offset = 0;
            } else {
-                output[start..].clone_from_slice(&block[..num_els]);
+                let uncompressed_block = &self.block_decoder.output_array()
+                    [self.inner_offset..self.inner_offset + num_els];
+                output[start..][..num_els].clone_from_slice(uncompressed_block);
+                self.inner_offset += num_els;
                break;
            }
        }
@@ -76,22 +58,23 @@ impl CompressedIntStream {
    ///
    /// If a full block is skipped, calling
    /// `.skip(...)` will avoid decompressing it.
-    ///
-    /// May panic if the end of the stream is reached.
    pub fn skip(&mut self, mut skip_len: usize) {
-        loop {
-            let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
-            if available >= skip_len {
-                self.inner_offset += skip_len;
-                break;
-            } else {
-                skip_len -= available;
-                // entirely skip decompressing some blocks.
-                let num_bits: u8 = self.buffer.get(self.addr);
+        let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
+        if available >= skip_len {
+            self.inner_offset += skip_len;
+        } else {
+            skip_len -= available;
+            // entirely skip decompressing some blocks.
+            while skip_len >= COMPRESSION_BLOCK_SIZE {
+                skip_len -= COMPRESSION_BLOCK_SIZE;
+                let num_bits: u8 = self.buffer.as_ref()[0];
                let block_len = compressed_block_size(num_bits);
-                self.addr += block_len;
-                self.inner_offset = 0;
+                self.buffer.advance(block_len);
            }
+            let num_consumed_bytes = self.block_decoder
+                .uncompress_block_unsorted(self.buffer.as_ref());
+            self.buffer.advance(num_consumed_bytes);
+            self.inner_offset = skip_len;
        }
    }
 }
@@ -108,7 +91,7 @@ pub mod tests {
    fn create_stream_buffer() -> ReadOnlySource {
        let mut buffer: Vec<u8> = vec![];
        let mut encoder = BlockEncoder::new();
-        let vals: Vec<u32> = (0u32..1152u32).collect();
+        let vals: Vec<u32> = (0u32..1_025u32).collect();
        for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
            let compressed_block = encoder.compress_block_unsorted(chunk);
            let num_bits = compressed_block[0];
@@ -130,24 +113,13 @@ pub mod tests {
        stream.read(&mut block[0..2]);
        assert_eq!(block[0], 0);
        assert_eq!(block[1], 1);
-
-        // reading does not consume the stream
-        stream.read(&mut block[0..2]);
-        assert_eq!(block[0], 0);
-        assert_eq!(block[1], 1);
-        stream.skip(2);
-
        stream.skip(5);
        stream.read(&mut block[0..3]);
-        stream.skip(3);
-
        assert_eq!(block[0], 7);
        assert_eq!(block[1], 8);
        assert_eq!(block[2], 9);
        stream.skip(500);
        stream.read(&mut block[0..3]);
-        stream.skip(3);
-
        assert_eq!(block[0], 510);
        assert_eq!(block[1], 511);
        assert_eq!(block[2], 512);
--- a/src/compression/vint/compression_vint_nosimd.rs
+++ b/src/compression/vint/compression_vint_nosimd.rs
--- a/src/compression/vint/compression_vint_simd.rs
+++ b/src/compression/vint/compression_vint_simd.rs
@@ -0,0 +1,72 @@
+mod streamvbyte { // raw bindings to the C streamvbyte routines
+
+    use libc::size_t;
+
+    extern "C" {
+        pub fn streamvbyte_delta_encode( // delta variant: takes an extra `offset`
+            data: *const u32,
+            num_els: u32,
+            output: *mut u8,
+            offset: u32,
+        ) -> size_t;
+
+        pub fn streamvbyte_delta_decode( // delta variant: takes an extra `offset`
+            compressed_data: *const u8,
+            output: *mut u32,
+            num_els: u32,
+            offset: u32,
+        ) -> size_t;
+
+        pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
+
+        pub fn streamvbyte_decode(
+            compressed_data: *const u8,
+            output: *mut u32,
+            num_els: usize, // NOTE(review): other bindings use u32 — check the C header
+        ) -> size_t;
+    }
+}
+
+#[inline(always)]
+pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
+    let compress_length = unsafe { // NOTE(review): the capacity of `output` is not checked here
+        streamvbyte::streamvbyte_delta_encode(
+            input.as_ptr(),
+            input.len() as u32, // NOTE(review): truncating cast for lengths above u32::MAX
+            output.as_mut_ptr(),
+            offset,
+        )
+    };
+    &output[..compress_length] // prefix of `output` whose length the C routine returned
+}
+
+#[inline(always)]
+pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
+    let compress_length = unsafe { // NOTE(review): the capacity of `output` is not checked here
+        streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr())
+    };
+    &output[..compress_length] // prefix of `output` whose length the C routine returned
+}
+
+#[inline(always)]
+pub(crate) fn uncompress_sorted<'a>(
+    compressed_data: &'a [u8],
+    output: &mut [u32],
+    offset: u32,
+) -> usize { // returns the C routine's result (presumably the bytes consumed — confirm)
+    unsafe { // NOTE(review): the length of `compressed_data` is not passed to the C side
+        streamvbyte::streamvbyte_delta_decode(
+            compressed_data.as_ptr(),
+            output.as_mut_ptr(),
+            output.len() as u32, // asks for exactly `output.len()` values
+            offset,
+        )
+    }
+}
+
+#[inline(always)]
+pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
+    unsafe { // asks for `output.len()` values; the length of `compressed_data` is not passed
+        streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len())
+    }
+}
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -6,11 +6,7 @@ use std::sync::Arc;
 use std::borrow::BorrowMut;
 use std::fmt;
 use core::SegmentId;
-
-
-#[cfg(feature="mmap")]
-use directory::MmapDirectory;
-use directory::{Directory, RAMDirectory};
+use directory::{Directory, MmapDirectory, RAMDirectory};
 use indexer::index_writer::open_index_writer;
 use core::searcher::Searcher;
 use std::convert::From;
@@ -65,7 +61,6 @@ impl Index {
    /// The index will use the `MMapDirectory`.
    ///
    /// If a previous index was in this directory, then its meta file will be destroyed.
-    #[cfg(feature="mmap")]
    pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
        let mmap_directory = MmapDirectory::open(directory_path)?;
        let directory = ManagedDirectory::new(mmap_directory)?;
@@ -85,8 +80,6 @@ impl Index {
    ///
    /// The temp directory is only used for testing the `MmapDirectory`.
    /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
-    #[cfg(feature="mmap")]
-    #[cfg(test)]
    pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
        let mmap_directory = MmapDirectory::create_from_tempdir()?;
        let directory = ManagedDirectory::new(mmap_directory)?;
@@ -114,7 +107,6 @@ impl Index {
    }

    /// Opens a new directory from an index path.
-    #[cfg(feature="mmap")]
    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
        let mmap_directory = MmapDirectory::open(directory_path)?;
        let directory = ManagedDirectory::new(mmap_directory)?;
@@ -122,13 +114,6 @@ impl Index {
        Index::create_from_metas(directory, &metas)
    }

-    pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
-        let directory = ManagedDirectory::new(directory)?;
-        let metas = load_metas(&directory)?;
-        Index::create_from_metas(directory, &metas)
-    }
-
-
    /// Reads the index meta file from the directory.
    pub fn load_metas(&self) -> Result<IndexMeta> {
        load_metas(self.directory())
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -4,9 +4,9 @@ use postings::{BlockSegmentPostings, SegmentPostings};
 use postings::TermInfo;
 use schema::IndexRecordOption;
 use schema::Term;
+use fastfield::DeleteBitSet;
 use compression::CompressedIntStream;
 use postings::FreqReadingOption;
-use common::BinarySerializable;
 use schema::FieldType;

 /// The inverted index reader is in charge of accessing
@@ -26,8 +26,8 @@ pub struct InvertedIndexReader {
    termdict: TermDictionaryImpl,
    postings_source: ReadOnlySource,
    positions_source: ReadOnlySource,
+    delete_bitset: DeleteBitSet,
    record_option: IndexRecordOption,
-    total_num_tokens: u64
 }

 impl InvertedIndexReader {
@@ -35,17 +35,15 @@ impl InvertedIndexReader {
        termdict: TermDictionaryImpl,
        postings_source: ReadOnlySource,
        positions_source: ReadOnlySource,
+        delete_bitset: DeleteBitSet,
        record_option: IndexRecordOption,
    ) -> InvertedIndexReader {
-        let total_num_tokens_data = postings_source.slice(0, 8);
-        let mut total_num_tokens_cursor = total_num_tokens_data.as_slice();
-        let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64);
        InvertedIndexReader {
            termdict,
-            postings_source: postings_source.slice_from(8),
+            postings_source,
            positions_source,
+            delete_bitset,
            record_option,
-            total_num_tokens
        }
    }

@@ -55,13 +53,13 @@ impl InvertedIndexReader {
        let record_option = field_type
            .get_index_record_option()
            .unwrap_or(IndexRecordOption::Basic);
-        InvertedIndexReader {
-            termdict:    TermDictionaryImpl::empty(field_type),
-            postings_source: ReadOnlySource::empty(),
-            positions_source: ReadOnlySource::empty(),
+        InvertedIndexReader::new(
+            TermDictionaryImpl::empty(field_type),
+            ReadOnlySource::empty(),
+            ReadOnlySource::empty(),
+            DeleteBitSet::empty(),
            record_option,
-            total_num_tokens: 0u64
-        }
+        )
    }

    /// Returns the term info associated with the term.
@@ -129,6 +127,7 @@ impl InvertedIndexReader {
        option: IndexRecordOption,
    ) -> SegmentPostings {
        let block_postings = self.read_block_postings_from_terminfo(term_info, option);
+        let delete_bitset = self.delete_bitset.clone();
        let position_stream = {
            if option.has_positions() {
                let position_offset = term_info.positions_offset;
@@ -140,17 +139,9 @@ impl InvertedIndexReader {
                None
            }
        };
-        SegmentPostings::from_block_postings(block_postings, position_stream)
+        SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
    }

-    /// Returns the total number of tokens recorded for all documents
-    /// (including deleted documents).
-    pub fn total_num_tokens(&self) -> u64 {
-        self.total_num_tokens
-    }
-
-
-
    /// Returns the segment postings associated with the term, and with the given option,
    /// or `None` if the term has never been encountered and indexed.
    ///
@@ -166,12 +157,6 @@ impl InvertedIndexReader {
        Some(self.read_postings_from_terminfo(&term_info, option))
    }

-    pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
-        let term_info = get!(self.get_term_info(term));
-        Some(self.read_postings_from_terminfo(&term_info, option))
-    }
-
-
    /// Returns the number of documents containing the term.
    pub fn doc_freq(&self, term: &Term) -> u32 {
        self.get_term_info(term)
@@ -179,6 +164,3 @@ impl InvertedIndexReader {
            .unwrap_or(0u32)
    }
 }
-
-
-
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -2,7 +2,9 @@ use Result;
 use core::SegmentReader;
 use schema::Document;
 use collector::Collector;
+use common::TimerTree;
 use query::Query;
+use DocId;
 use DocAddress;
 use schema::{Field, Term};
 use termdict::{TermDictionary, TermMerger};
@@ -31,20 +33,20 @@ impl Searcher {
    }

    /// Returns the overall number of documents in the index.
-    pub fn num_docs(&self) -> u64 {
+    pub fn num_docs(&self) -> DocId {
        self.segment_readers
            .iter()
-            .map(|segment_reader| segment_reader.num_docs() as u64)
-            .sum::<u64>()
+            .map(|segment_reader| segment_reader.num_docs())
+            .sum::<u32>()
    }

    /// Return the overall number of documents containing
    /// the given term.
-    pub fn doc_freq(&self, term: &Term) -> u64 {
+    pub fn doc_freq(&self, term: &Term) -> u32 {
        self.segment_readers
            .iter()
-            .map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term) as u64)
-            .sum::<u64>()
+            .map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term))
+            .sum::<u32>()
    }

    /// Return the list of segment readers
@@ -58,7 +60,7 @@ impl Searcher {
    }

    /// Runs a query on the segment readers wrapped by the searcher
-    pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
+    pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<TimerTree> {
        query.search(self, collector)
    }

--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -25,7 +25,6 @@ use schema::Schema;
 use termdict::TermDictionary;
 use fastfield::{FastValue, MultiValueIntFastFieldReader};
 use schema::Cardinality;
-use fieldnorm::FieldNormReader;

 /// Entry point to access all of the datastructures of the `Segment`
 ///
@@ -54,7 +53,7 @@ pub struct SegmentReader {
    fieldnorms_composite: CompositeFile,

    store_reader: StoreReader,
-    delete_bitset_opt: Option<DeleteBitSet>,
+    delete_bitset: DeleteBitSet,
    schema: Schema,
 }

@@ -79,14 +78,7 @@ impl SegmentReader {
    /// Return the number of documents that have been
    /// deleted in the segment.
    pub fn num_deleted_docs(&self) -> DocId {
-        self.delete_bitset()
-            .map(|delete_set| delete_set.len() as DocId)
-            .unwrap_or(0u32)
-    }
-
-    /// Returns true iff some of the documents of the segment have been deleted.
-    pub fn has_deletes(&self) -> bool {
-        self.delete_bitset().is_some()
+        self.delete_bitset.len() as DocId
    }

    /// Accessor to a segment's fast field reader given a field.
@@ -105,12 +97,25 @@ impl SegmentReader {
    ) -> fastfield::Result<FastFieldReader<Item>> {
        let field_entry = self.schema.get_field_entry(field);
        if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
-            {
-                self.fast_fields_composite
-                    .open_read(field)
-                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                    .map(FastFieldReader::open)
-            } else {
+        {
+            self.fast_fields_composite
+                .open_read(field)
+                .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
+                .map(FastFieldReader::open)
+        } else {
+            Err(FastFieldNotAvailableError::new(field_entry))
+        }
+    }
+
+    pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
+        &self,
+        field: Field,
+        idx: usize
+    ) -> fastfield::Result<FastFieldReader<Item>> {
+        if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
+            Ok(FastFieldReader::open(ff_source))
+        } else {
+            let field_entry = self.schema.get_field_entry(field);
            Err(FastFieldNotAvailableError::new(field_entry))
        }
    }
@@ -123,17 +128,11 @@ impl SegmentReader {
    ) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
        let field_entry = self.schema.get_field_entry(field);
        if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
-            {
-                let idx_reader = self.fast_fields_composite
-                    .open_read_with_idx(field, 0)
-                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                    .map(FastFieldReader::open)?;
-                let vals_reader = self.fast_fields_composite
-                    .open_read_with_idx(field, 1)
-                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                    .map(FastFieldReader::open)?;
-                Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
-            } else {
+        {
+            let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
+            let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
+            Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
+        } else {
            Err(FastFieldNotAvailableError::new(field_entry))
        }
    }
@@ -170,15 +169,10 @@ impl SegmentReader {
    ///
    /// They are simply stored as a fast field, serialized in
    /// the `.fieldnorm` file of the segment.
-    pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
-        if let Some(fieldnorm_source) = self.fieldnorms_composite
-            .open_read(field) {
-            FieldNormReader::open(fieldnorm_source)
-        } else {
-            let field_name = self.schema.get_field_name(field);
-            let err_msg=  format!("Field norm not found for field {:?}. Was it market as indexed during indexing.", field_name);
-            panic!(err_msg);
-        }
+    pub fn get_fieldnorms_reader(&self, field: Field) -> Option<FastFieldReader<u64>> {
+        self.fieldnorms_composite
+            .open_read(field)
+            .map(FastFieldReader::open)
    }

    /// Accessor to the segment's `StoreReader`.
@@ -211,13 +205,12 @@ impl SegmentReader {
        let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
        let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;

-        let delete_bitset_opt =
-            if segment.meta().has_deletes() {
-                let delete_data = segment.open_read(SegmentComponent::DELETE)?;
-                Some(DeleteBitSet::open(delete_data))
-            } else {
-                None
-            };
+        let delete_bitset = if segment.meta().has_deletes() {
+            let delete_data = segment.open_read(SegmentComponent::DELETE)?;
+            DeleteBitSet::open(delete_data)
+        } else {
+            DeleteBitSet::empty()
+        };

        let schema = segment.schema();
        Ok(SegmentReader {
@@ -229,7 +222,7 @@ impl SegmentReader {
            fieldnorms_composite,
            segment_id: segment.id(),
            store_reader,
-            delete_bitset_opt,
+            delete_bitset,
            positions_composite,
            schema,
        })
@@ -284,6 +277,7 @@ impl SegmentReader {
            TermDictionaryImpl::from_source(termdict_source),
            postings_source,
            positions_source,
+            self.delete_bitset.clone(),
            record_option,
        ));

@@ -312,16 +306,14 @@ impl SegmentReader {

    /// Returns the bitset representing
    /// the documents that have been deleted.
-    pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
-        self.delete_bitset_opt.as_ref()
+    pub fn delete_bitset(&self) -> &DeleteBitSet {
+        &self.delete_bitset
    }

    /// Returns true iff the `doc` is marked
    /// as deleted.
    pub fn is_deleted(&self, doc: DocId) -> bool {
-        self.delete_bitset()
-            .map(|delete_set| delete_set.is_deleted(doc))
-            .unwrap_or(false)
+        self.delete_bitset.is_deleted(doc)
    }
 }

--- a/src/directory/managed_directory.rs
+++ b/src/directory/managed_directory.rs
@@ -282,7 +282,6 @@ impl Clone for ManagedDirectory {
 mod tests {

    use super::*;
-    #[cfg(feature="mmap")]
    use directory::MmapDirectory;
    use std::path::Path;
    use std::io::Write;
@@ -294,7 +293,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap")]
    fn test_managed_directory() {
        let tempdir = TempDir::new("index").unwrap();
        let tempdir_path = PathBuf::from(tempdir.path());
@@ -343,7 +341,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap ")]
    fn test_managed_directory_gc_while_mmapped() {
        let tempdir = TempDir::new("index").unwrap();
        let tempdir_path = PathBuf::from(tempdir.path());
@@ -373,7 +370,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap")]
    fn test_managed_directory_protect() {
        let tempdir = TempDir::new("index").unwrap();
        let tempdir_path = PathBuf::from(tempdir.path());
--- a/src/directory/mod.rs
+++ b/src/directory/mod.rs
@@ -3,29 +3,21 @@
 WORM directory abstraction.

 */
-
-#[cfg(feature="mmap")]
 mod mmap_directory;
-
 mod ram_directory;
 mod directory;
 mod read_only_source;
 mod shared_vec_slice;
 mod managed_directory;
-mod static_directory;

 /// Errors specific to the directory module.
 pub mod error;

 use std::io::{BufWriter, Seek, Write};

-pub use self::static_directory::StaticDirectory;
-pub use self::static_directory::write_static_from_directory;
 pub use self::read_only_source::ReadOnlySource;
 pub use self::directory::Directory;
 pub use self::ram_directory::RAMDirectory;
-
-#[cfg(feature="mmap")]
 pub use self::mmap_directory::MmapDirectory;

 pub(crate) use self::read_only_source::SourceRead;
@@ -59,7 +51,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap")]
    fn test_mmap_directory() {
        let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
        test_directory(&mut mmap_directory);
--- a/src/directory/read_only_source.rs
+++ b/src/directory/read_only_source.rs
@@ -1,4 +1,3 @@
-#[cfg(feature="mmap")]
 use fst::raw::MmapReadOnly;
 use std::ops::Deref;
 use super::shared_vec_slice::SharedVecSlice;
@@ -7,8 +6,6 @@ use std::slice;
 use std::io::{self, Read};
 use stable_deref_trait::{CloneStableDeref, StableDeref};

-const EMPTY_SLICE: [u8; 0] = [];
-
 /// Read object that represents files in tantivy.
 ///
 /// These read objects are only in charge to deliver
@@ -17,12 +14,9 @@ const EMPTY_SLICE: [u8; 0] = [];
 /// hold by this object should never be altered or destroyed.
 pub enum ReadOnlySource {
    /// Mmap source of data
-    #[cfg(feature="mmap")]
    Mmap(MmapReadOnly),
    /// Wrapping a `Vec<u8>`
    Anonymous(SharedVecSlice),
-    /// Wrapping a static slice
-    Static(&'static [u8])
 }

 unsafe impl StableDeref for ReadOnlySource {}
@@ -39,16 +33,14 @@ impl Deref for ReadOnlySource {
 impl ReadOnlySource {
    /// Creates an empty ReadOnlySource
    pub fn empty() -> ReadOnlySource {
-        ReadOnlySource::Static(&EMPTY_SLICE)
+        ReadOnlySource::Anonymous(SharedVecSlice::empty())
    }

    /// Returns the data underlying the ReadOnlySource object.
    pub fn as_slice(&self) -> &[u8] {
        match *self {
-            #[cfg(feature="mmap")]
            ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
            ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
-            ReadOnlySource::Static(data) => data,
        }
    }

@@ -71,9 +63,7 @@ impl ReadOnlySource {
    /// 1KB slice is remaining, the whole `500MBs`
    /// are retained in memory.
    pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
-        assert!(from_offset <= to_offset, "Requested negative slice [{}..{}]", from_offset, to_offset);
        match *self {
-            #[cfg(feature="mmap")]
            ReadOnlySource::Mmap(ref mmap_read_only) => {
                let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
                ReadOnlySource::Mmap(sliced_mmap)
@@ -81,9 +71,6 @@ impl ReadOnlySource {
            ReadOnlySource::Anonymous(ref shared_vec) => {
                ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
            }
-            ReadOnlySource::Static(data) => {
-                ReadOnlySource::Static(&data[from_offset..to_offset])
-            }
        }
    }

@@ -124,12 +111,6 @@ impl From<Vec<u8>> for ReadOnlySource {
    }
 }

-impl From<&'static [u8]> for ReadOnlySource {
-    fn from(data: &'static [u8]) -> ReadOnlySource {
-        ReadOnlySource::Static(data)
-    }
-}
-
 /// Acts as a owning cursor over the data backed up by a `ReadOnlySource`
 pub(crate) struct SourceRead {
    _data_owner: ReadOnlySource,
@@ -141,16 +122,6 @@ impl SourceRead {
    pub fn advance(&mut self, len: usize) {
        self.cursor = &self.cursor[len..];
    }
-
-    pub fn slice_from(&self, start: usize) -> &[u8] {
-        &self.cursor[start..]
-
-    }
-
-    pub fn get(&self, idx: usize) -> u8 {
-        self.cursor[idx]
-    }
-
 }

 impl AsRef<[u8]> for SourceRead {
--- a/src/directory/static_directory.rs
+++ b/src/directory/static_directory.rs
@@ -1,123 +0,0 @@
-use std::collections::HashMap;
-use Directory;
-use std::path::PathBuf;
-use directory::ReadOnlySource;
-use std::io::BufWriter;
-use directory::error::{DeleteError, OpenReadError, OpenWriteError};
-use std::path::Path;
-use std::fmt::{Formatter, Debug, self};
-use Result as TantivyResult;
-use directory::SeekableWrite;
-use std::io;
-use std::fs;
-use common::Endianness;
-use common::BinarySerializable;
-use common::VInt;
-use byteorder::ByteOrder;
-use std::str;
-use std::fs::File;
-use std::io::{Read, Write};
-use std::ffi::OsString;
-
-#[derive(Clone)]
-pub struct StaticDirectory {
-    files: HashMap<PathBuf, &'static [u8]>,
-}
-
-impl Debug for StaticDirectory {
-    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
-        write!(f, "StaticDirectory[{} files]", self.files.len())?;
-        Ok(())
-    }
-}
-
-impl StaticDirectory {
-    pub fn open(mut data: &'static [u8]) -> TantivyResult<StaticDirectory> {
-        assert!(data.len() > 8);
-        let footer_len_offset = data.len() - 8;
-        let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
-        let mut body = &data[..body_len];
-        let mut footer = &data[body_len..footer_len_offset];
-        let num_files = VInt::deserialize(&mut footer)?.0 as usize;
-        let mut files = HashMap::new();
-        for _ in 0..num_files {
-            let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
-            let filename = &footer[..filename_len];
-            footer = &footer[filename_len..];
-            let data_len = VInt::deserialize(&mut footer)?.0 as usize;
-            let file_data = &body[..data_len];
-            body = &body[data_len..];
-            let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
-            let filename = PathBuf::from(filename_str);
-            println!("{:?} {:?}", filename, data_len);
-            files.insert(filename, file_data);
-        }
-        Ok(StaticDirectory {
-            files
-        })
-    }
-}
-
-impl Directory for StaticDirectory {
-    fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
-        if let Some(static_data) = self.files.get(path) {
-            Ok(ReadOnlySource::from(*static_data))
-        } else {
-            Err(OpenReadError::FileDoesNotExist(path.to_owned()))
-        }
-    }
-
-    fn delete(&self, path: &Path) -> Result<(), DeleteError> {
-        unimplemented!("Static directory is read-only !")
-    }
-
-    fn exists(&self, path: &Path) -> bool {
-        self.files.contains_key(path)
-    }
-
-    fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
-        unimplemented!("Static directory is read-only !")
-    }
-
-    fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
-        if let Some(static_data) = self.files.get(path) {
-            Ok(static_data.to_vec())
-        } else {
-            Err(OpenReadError::FileDoesNotExist(path.to_owned()))
-        }
-    }
-
-    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
-        unimplemented!("Static directory is read-only !")
-    }
-
-    fn box_clone(&self) -> Box<Directory> {
-        box self.clone()
-    }
-}
-
-pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
-    assert!(directory_path.is_dir());
-    let mut file_data: Vec<(OsString, usize)> = Vec::new();
-    let mut write: Vec<u8> = Vec::new();
-    for entry in fs::read_dir(directory_path)? {
-        let entry = entry?;
-        let path = entry.path();
-        if path.is_file() {
-            info!("Appending {}", path.to_string_lossy());
-            let mut open_file = File::open(&path)?;
-            let file_len = open_file.read_to_end(&mut write)?;
-            file_data.push((entry.file_name(), file_len));
-        }
-    }
-    // write footer
-    let body_len = write.len();
-    VInt(file_data.len() as u64).serialize(&mut write)?;
-    for (filename, filelen) in file_data {
-        VInt(filename.len() as u64).serialize(&mut write)?;
-        write.write_all(filename.to_string_lossy().as_bytes())?;
-        VInt(filelen as u64).serialize(&mut write)?;
-    }
-    (body_len as u64).serialize(&mut write)?;
-    Ok(write)
-}
--- a/src/fastfield/delete.rs
+++ b/src/fastfield/delete.rs
@@ -51,7 +51,21 @@ impl DeleteBitSet {
        }
    }

-    /// Returns whether the document has been marked as deleted.
+    /// Returns an empty delete bit set.
+    pub fn empty() -> DeleteBitSet {
+        DeleteBitSet {
+            data: ReadOnlySource::empty(),
+            len: 0,
+        }
+    }
+
+    /// Returns true iff the segment has some deleted documents.
+    pub fn has_deletes(&self) -> bool {
+        self.len() > 0
+    }
+
+    /// Returns true iff the document is deleted.
+    #[inline]
    pub fn is_deleted(&self, doc: DocId) -> bool {
        if self.len == 0 {
            false
@@ -62,10 +76,8 @@ impl DeleteBitSet {
            b & (1u8 << shift) != 0
        }
    }
-
 }

-
 impl HasLen for DeleteBitSet {
    fn len(&self) -> usize {
        self.len
--- a/src/fastfield/multivalued/reader.rs
+++ b/src/fastfield/multivalued/reader.rs
@@ -26,13 +26,31 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
        }
    }

+    /// Returns `(start, stop)`, such that the values associated
+    /// to the given document are `start..stop`.
+    fn range(&self, doc: DocId) -> (u64, u64) {
+        let start = self.idx_reader.get(doc);
+        let stop = self.idx_reader.get(doc + 1);
+        (start, stop)
+    }
+
+    /// Returns the number of values associated to a given document.
+    pub fn num_vals(&self, doc: DocId) -> usize {
+        let (start, stop) = self.range(doc);
+        (stop - start) as usize
+    }
+
+    /// Returns the overall number of values associated to documents.
+    pub(crate) fn total_num_vals(&self) -> u64 {
+        self.idx_reader.max_value()
+    }
+
    /// Returns the array of values associated to the given `doc`.
    pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
-        let start = self.idx_reader.get(doc) as u32;
-        let stop = self.idx_reader.get(doc + 1) as u32;
+        let (start, stop) = self.range(doc);
        let len = (stop - start) as usize;
        vals.resize(len, Item::default());
-        self.vals_reader.get_range(start, &mut vals[..]);
+        self.vals_reader.get_range(start as u32, &mut vals[..]);
    }
 }

--- a/src/fastfield/multivalued/writer.rs
+++ b/src/fastfield/multivalued/writer.rs
@@ -6,6 +6,7 @@ use postings::UnorderedTermId;
 use schema::{Document, Field};
 use std::io;
 use itertools::Itertools;
+use termdict::TermOrdinal;

 pub struct MultiValueIntFastFieldWriter {
    field: Field,
@@ -66,7 +67,7 @@ impl MultiValueIntFastFieldWriter {
    pub fn serialize(
        &self,
        serializer: &mut FastFieldSerializer,
-        mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
+        mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
    ) -> io::Result<()> {
        {
            // writing the offset index
@@ -90,13 +91,13 @@ impl MultiValueIntFastFieldWriter {
                        1,
                    )?;
                    for val in &self.vals {
-                        let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
+                        let remapped_val = *mapping.get(val).expect("Missing term ordinal");
                        value_serializer.add_val(remapped_val)?;
                    }
                }
                None => {
                    let val_min_max = self.vals.iter().cloned().minmax();
-                    let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
+                    let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
                    value_serializer =
                        serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
                    for &val in &self.vals {
--- a/src/fastfield/reader.rs
+++ b/src/fastfield/reader.rs
@@ -71,6 +71,9 @@ impl<Item: FastValue> FastFieldReader<Item> {
    ///
    /// May panic if `start + output.len()` is greater than
    /// the segment's `maxdoc`.
+    ///
+    // TODO change start to `u64`.
+    // For multifastfield, start is an index in a second fastfield, not a `DocId`
    pub fn get_range(&self, start: u32, output: &mut [Item]) {
        let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
        self.bit_unpacker.get_range(start, output_u64);
--- a/src/fastfield/serializer.rs
+++ b/src/fastfield/serializer.rs
@@ -77,11 +77,21 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
 }

 impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
+
+    /// Creates a new fast field serializer.
+    ///
+    /// The serializer in fact encodes the values by bitpacking
+    /// `(val - min_value)`.
+    ///
+    /// It requires a `min_value` and a `max_value` to
+    /// compute the minimum number of bits required to encode
+    /// values.
    fn open(
        write: &'a mut W,
        min_value: u64,
        max_value: u64,
    ) -> io::Result<FastSingleFieldSerializer<'a, W>> {
+        assert!(min_value <= max_value);
        min_value.serialize(write)?;
        let amplitude = max_value - min_value;
        amplitude.serialize(write)?;
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -1,6 +1,7 @@
 use schema::{Cardinality, Document, Field, Schema};
 use fastfield::FastFieldSerializer;
 use std::io;
+use DocId;
 use schema::FieldType;
 use common;
 use common::VInt;
@@ -8,6 +9,7 @@ use std::collections::HashMap;
 use postings::UnorderedTermId;
 use super::multivalued::MultiValueIntFastFieldWriter;
 use common::BinarySerializable;
+use termdict::TermOrdinal;

 /// The fastfieldswriter regroup all of the fast field writers.
 pub struct FastFieldsWriter {
@@ -56,6 +58,15 @@ impl FastFieldsWriter {
        }
    }

+    /// Returns a `FastFieldsWriter` with a `u64` `IntFastFieldWriter` for each
+    /// of the fields given in argument.
+    pub(crate) fn new(fields: Vec<Field>) -> FastFieldsWriter {
+        FastFieldsWriter {
+            single_value_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
+            multi_values_writers: vec![],
+        }
+    }
+
    /// Get the `FastFieldWriter` associated to a field.
    pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
        // TODO optimize
@@ -95,7 +106,7 @@ impl FastFieldsWriter {
    pub fn serialize(
        &self,
        serializer: &mut FastFieldSerializer,
-        mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
+        mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
    ) -> io::Result<()> {
        for field_writer in &self.single_value_writers {
            field_writer.serialize(serializer)?;
@@ -106,6 +117,16 @@ impl FastFieldsWriter {
        }
        Ok(())
    }
+
+    /// Ensures all of the fast field writers have
+    /// reached `doc`. (included)
+    ///
+    /// The missing values will be filled with 0.
+    pub fn fill_val_up_to(&mut self, doc: DocId) {
+        for field_writer in &mut self.single_value_writers {
+            field_writer.fill_val_up_to(doc);
+        }
+    }
 }

 /// Fast field writer for ints.
@@ -158,6 +179,19 @@ impl IntFastFieldWriter {
        self.val_if_missing = val_if_missing;
    }

+    /// Ensures the fast field writer has
+    /// reached `doc`. (included)
+    ///
+    /// The missing values will be filled with `val_if_missing`.
+    fn fill_val_up_to(&mut self, doc: DocId) {
+        let target = doc as usize + 1;
+        debug_assert!(self.val_count <= target);
+        let val_if_missing = self.val_if_missing;
+        while self.val_count < target {
+            self.add_val(val_if_missing);
+        }
+    }
+
    /// Records a new value.
    ///
    /// The n-th value being recorded is implicitely
--- a/src/fieldnorm/code.rs
+++ b/src/fieldnorm/code.rs
@@ -1,106 +0,0 @@
-
-#[inline(always)]
-pub fn id_to_fieldnorm(id: u8) -> u32 {
-    FIELD_NORMS_TABLE[id as usize]
-}
-
-
-#[inline(always)]
-pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
-    FIELD_NORMS_TABLE
-        .binary_search(&fieldnorm)
-        .unwrap_or_else(|idx| idx - 1) as u8
-}
-
-
-pub const FIELD_NORMS_TABLE: [u32; 256] = [
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54,
-    56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144,
-    152, 168, 184, 200, 216, 232, 248, 264, 280, 312, 344, 376, 408, 440, 472, 504,
-    536, 600, 664, 728, 792, 856, 920, 984,
-    1048, 1176, 1304, 1432, 1560, 1688, 1816, 1944,
-    2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
-    4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240,
-    10264, 11288, 12312, 13336, 14360, 15384,
-    16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744,
-    32792, 36888, 40984, 45080, 49176, 53272, 57368, 61464,
-    65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
-    163864,  180248,  196632,  213016,  229400,  245784,  262168,
-    294936,  327704,  360472,  393240,  426008,  458776,
-    491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528,
-    983064, 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032,
-    1966104, 2097176, 2359320, 2621464, 2883608, 3145752, 3407896, 3670040, 3932184,
-    4194328, 4718616, 5242904, 5767192, 6291480, 6815768, 7340056, 7864344, 8388632, 9437208,
-    10485784, 11534360, 12582936, 13631512, 14680088, 15728664, 16777240, 18874392, 20971544,
-    23068696, 25165848, 27263000, 29360152, 31457304, 33554456, 37748760, 41943064,
-    46137368, 50331672, 54525976, 58720280, 62914584, 67108888, 75497496, 83886104,
-    92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, 167772184,
-    184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, 335544344,
-    369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, 671088664,
-    738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, 1342177304,
-    1476395032, 1610612760, 1744830488, 1879048216, 2013265944
-];
-
-
-
-#[cfg(test)]
-mod tests {
-
-    use super::{fieldnorm_to_id, id_to_fieldnorm, FIELD_NORMS_TABLE};
-
-
-    #[test]
-    fn test_decode_code() {
-        assert_eq!(fieldnorm_to_id(0), 0);
-        assert_eq!(fieldnorm_to_id(1), 1);
-        for i in 0..41 {
-            assert_eq!(fieldnorm_to_id(i), i as u8);
-        }
-        assert_eq!(fieldnorm_to_id(41), 40);
-        assert_eq!(fieldnorm_to_id(42), 41);
-        for id in 43..256 {
-            let field_norm = FIELD_NORMS_TABLE[id];
-            assert_eq!(id_to_fieldnorm(id as u8), field_norm);
-            assert_eq!(fieldnorm_to_id(field_norm), id as u8);
-            assert_eq!(fieldnorm_to_id(field_norm - 1), id as u8 - 1);
-            assert_eq!(fieldnorm_to_id(field_norm + 1), id as u8);
-        }
-    }
-
-    #[test]
-    fn test_u32_max() {
-        assert_eq!(fieldnorm_to_id(u32::max_value()), u8::max_value());
-    }
-
-    #[test]
-    fn test_fieldnorm_byte() {
-        // const expression are not really a thing
-        // yet... Therefore we do things the other way around.
-
-        // The array is defined as a const,
-        // and we check in the unit test that the const
-        // value is matching the logic.
-        const IDENTITY_PART: u8 = 24u8;
-        fn decode_field_norm_exp_part(b: u8) -> u32 {
-            let bits = (b & 0b00000111) as u32;
-            let shift = b >> 3;
-            if shift == 0 {
-                bits
-            } else {
-                (bits | 8u32) << ((shift - 1u8) as u32)
-            }
-        }
-        fn decode_fieldnorm_byte(b: u8) -> u32 {
-            if b < IDENTITY_PART {
-                b as u32
-            } else {
-                (IDENTITY_PART as u32) + decode_field_norm_exp_part(b - IDENTITY_PART)
-            }
-        }
-        for i in 0..256 {
-            assert_eq!(FIELD_NORMS_TABLE[i], decode_fieldnorm_byte(i as u8));
-        }
-    }
-}
--- a/src/fieldnorm/mod.rs
+++ b/src/fieldnorm/mod.rs
@@ -1,29 +0,0 @@
-//! The fieldnorm represents the length associated to
-//! a given Field of a given document.
-//!
-//! This metric is important to compute the score of a
-//! document : a document having a query word in one its short fields
-//! (e.g. title)  is likely to be more relevant than in one of its longer field
-//! (e.g. body).
-//!
-//! It encodes `fieldnorm` on one byte with some precision loss,
-//! using the exact same scheme as Lucene. Each value is place on a log-scale
-//! that takes values from `0` to `255`.
-//!
-//! A value on this scale is identified by a `fieldnorm_id`.
-//! Apart from compression, this scale also makes it possible to
-//! precompute computationally expensive functions of the fieldnorm
-//! in a very short array.
-//!
-//! This trick is used by the [BM25 similarity]().
-mod code;
-mod serializer;
-mod writer;
-mod reader;
-
-pub use self::reader::FieldNormReader;
-pub use self::writer::FieldNormsWriter;
-pub use self::serializer::FieldNormsSerializer;
-
-use self::code::{fieldnorm_to_id, id_to_fieldnorm};
-
--- a/src/fieldnorm/reader.rs
+++ b/src/fieldnorm/reader.rs
@@ -1,82 +0,0 @@
-use super::{id_to_fieldnorm, fieldnorm_to_id};
-use directory::ReadOnlySource;
-use DocId;
-
-
-/// Reads the fieldnorm associated to a document.
-/// The fieldnorm represents the length associated to
-/// a given Field of a given document.
-///
-/// This metric is important to compute the score of a
-/// document : a document having a query word in one its short fields
-/// (e.g. title)  is likely to be more relevant than in one of its longer field
-/// (e.g. body).
-///
-/// tantivy encodes `fieldnorm` on one byte with some precision loss,
-/// using the same scheme as Lucene. Each value is place on a log-scale
-/// that takes values from `0` to `255`.
-///
-/// A value on this scale is identified by a `fieldnorm_id`.
-/// Apart from compression, this scale also makes it possible to
-/// precompute computationally expensive functions of the fieldnorm
-/// in a very short array.
-pub struct FieldNormReader {
-    data: ReadOnlySource
-}
-
-impl FieldNormReader {
-
-    /// Opens a field norm reader given its data source.
-    pub fn open(data: ReadOnlySource) -> Self {
-        FieldNormReader {
-            data
-        }
-    }
-
-    /// Returns the `fieldnorm` associated to a doc id.
-    /// The fieldnorm is a value approximating the number
-    /// of tokens in a given field of the `doc_id`.
-    ///
-    /// It is imprecise, and always lower than the actual
-    /// number of tokens.
-    ///
-    /// The fieldnorm is effectively decoded from the
-    /// `fieldnorm_id` by doing a simple table lookup.
-    pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
-        let fieldnorm_id = self.fieldnorm_id(doc_id);
-        id_to_fieldnorm(fieldnorm_id)
-    }
-
-    /// Returns the `fieldnorm_id` associated to a document.
-    #[inline(always)]
-    pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
-        let fielnorms_data = self.data.as_slice();
-        fielnorms_data[doc_id as usize]
-    }
-
-    /// Converts a `fieldnorm_id` into a fieldnorm.
-    #[inline(always)]
-    pub fn id_to_fieldnorm(id: u8) -> u32 {
-        id_to_fieldnorm(id)
-    }
-
-    /// Converts a `fieldnorm` into a `fieldnorm_id`.
-    /// (This function is not injective).
-    #[inline(always)]
-    pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
-        fieldnorm_to_id(fieldnorm)
-    }
-}
-
-#[cfg(test)]
-impl From<Vec<u32>> for FieldNormReader {
-    fn from(field_norms: Vec<u32>) -> FieldNormReader {
-        let field_norms_id = field_norms.into_iter()
-            .map(FieldNormReader::fieldnorm_to_id)
-            .collect::<Vec<u8>>();
-        let field_norms_data = ReadOnlySource::from(field_norms_id);
-        FieldNormReader {
-            data: field_norms_data
-        }
-    }
-}
--- a/src/fieldnorm/serializer.rs
+++ b/src/fieldnorm/serializer.rs
@@ -1,37 +0,0 @@
-use directory::WritePtr;
-use std::io;
-use common::CompositeWrite;
-use schema::Field;
-use std::io::Write;
-
-
-pub struct FieldNormsSerializer {
-    composite_write: CompositeWrite,
-}
-
-impl FieldNormsSerializer {
-
-    /// Constructor
-    pub fn from_write(write: WritePtr) -> io::Result<FieldNormsSerializer> {
-        // just making room for the pointer to header.
-        let composite_write = CompositeWrite::wrap(write);
-        Ok(FieldNormsSerializer {
-            composite_write
-        })
-    }
-
-
-    pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> {
-        let write = self.composite_write.for_field(field);
-        write.write_all(fieldnorms_data)?;
-        write.flush()?;
-        Ok(())
-    }
-
-    pub fn close(self) -> io::Result<()> {
-        self.composite_write.close()?;
-        Ok(())
-    }
-
-}
-
--- a/src/fieldnorm/writer.rs
+++ b/src/fieldnorm/writer.rs
@@ -1,65 +0,0 @@
-use DocId;
-
-use schema::Field;
-use super::FieldNormsSerializer;
-use std::io;
-use schema::Schema;
-use super::fieldnorm_to_id;
-
-pub struct FieldNormsWriter {
-    fields: Vec<Field>,
-    fieldnorms_buffer: Vec<Vec<u8>>
-}
-
-impl FieldNormsWriter {
-
-    pub fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
-        schema
-            .fields()
-            .iter()
-            .enumerate()
-            .filter(|&(_, field_entry)| {
-                field_entry.is_indexed()
-            })
-            .map(|(field, _)| Field(field as u32))
-            .collect::<Vec<Field>>()
-    }
-
-    pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
-        let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
-        let max_field = fields
-            .iter()
-            .map(|field| field.0)
-            .max()
-            .map(|max_field_id| max_field_id as usize + 1)
-            .unwrap_or(0);
-        FieldNormsWriter {
-            fields,
-            fieldnorms_buffer: (0..max_field)
-                .map(|_| Vec::new())
-                .collect::<Vec<_>>()
-        }
-    }
-
-    pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
-        for &field in self.fields.iter() {
-            self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
-        }
-    }
-
-    pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
-        let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
-        assert!(fieldnorm_buffer.len() <= doc as usize, "Cannot register a given fieldnorm twice");
-        // we fill intermediary `DocId` as  having a fieldnorm of 0.
-        fieldnorm_buffer.resize(doc as usize + 1, 0u8);
-        fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
-    }
-
-    pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
-        for &field in self.fields.iter() {
-            let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
-            fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
-        }
-        Ok(())
-    }
-}
--- a/src/functional_test.rs
+++ b/src/functional_test.rs
@@ -13,7 +13,6 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {

 #[test]
 #[ignore]
-#[cfg(feature="mmap")]
 fn test_indexing() {
    let mut schema_builder = SchemaBuilder::default();

--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -200,7 +200,6 @@ pub fn advance_deletes(
    target_opstamp: u64,
 ) -> Result<Option<FileProtection>> {
    let mut file_protect: Option<FileProtection> = None;
-
    {
        if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
            // We are already up-to-date here.
@@ -241,7 +240,6 @@ pub fn advance_deletes(
        }
    }
    segment_entry.set_meta(segment.meta().clone());
-
    Ok(file_protect)
 }

--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -1,7 +1,7 @@
 pub mod index_writer;
 pub mod segment_serializer;
 pub mod merger;
-pub mod merge_policy;
+mod merge_policy;
 mod log_merge_policy;
 mod segment_register;
 mod segment_writer;
--- a/src/indexer/segment_serializer.rs
+++ b/src/indexer/segment_serializer.rs
@@ -4,7 +4,6 @@ use core::Segment;
 use core::SegmentComponent;
 use fastfield::FastFieldSerializer;
 use store::StoreWriter;
-use fieldnorm::FieldNormsSerializer;
 use postings::InvertedIndexSerializer;

 /// Segment serializer is in charge of laying out on disk
@@ -12,7 +11,7 @@ use postings::InvertedIndexSerializer;
 pub struct SegmentSerializer {
    store_writer: StoreWriter,
    fast_field_serializer: FastFieldSerializer,
-    fieldnorms_serializer: FieldNormsSerializer,
+    fieldnorms_serializer: FastFieldSerializer,
    postings_serializer: InvertedIndexSerializer,
 }

@@ -25,14 +24,14 @@ impl SegmentSerializer {
        let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;

        let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
-        let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
+        let fieldnorms_serializer = FastFieldSerializer::from_write(fieldnorms_write)?;

        let postings_serializer = InvertedIndexSerializer::open(segment)?;
        Ok(SegmentSerializer {
+            postings_serializer,
            store_writer: StoreWriter::new(store_write),
            fast_field_serializer,
            fieldnorms_serializer,
-            postings_serializer,
        })
    }

@@ -47,7 +46,7 @@ impl SegmentSerializer {
    }

    /// Accessor to the field norm serializer.
-    pub fn  get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
+    pub fn get_fieldnorms_serializer(&mut self) -> &mut FastFieldSerializer {
        &mut self.fieldnorms_serializer
    }

--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -7,8 +7,10 @@ use schema::Term;
 use core::Segment;
 use core::SerializableSegment;
 use fastfield::FastFieldsWriter;
+use schema::Field;
 use schema::FieldType;
 use indexer::segment_serializer::SegmentSerializer;
+use std::collections::HashMap;
 use datastruct::stacker::Heap;
 use indexer::index_writer::MARGIN_IN_BYTES;
 use super::operation::AddOperation;
@@ -17,7 +19,6 @@ use tokenizer::BoxedTokenizer;
 use tokenizer::FacetTokenizer;
 use tokenizer::{TokenStream, Tokenizer};
 use schema::Value;
-use fieldnorm::FieldNormsWriter;

 /// A `SegmentWriter` is in charge of creating segment index from a
 /// documents.
@@ -30,11 +31,21 @@ pub struct SegmentWriter<'a> {
    multifield_postings: MultiFieldPostingsWriter<'a>,
    segment_serializer: SegmentSerializer,
    fast_field_writers: FastFieldsWriter,
-    fieldnorms_writer: FieldNormsWriter,
+    fieldnorms_writer: FastFieldsWriter,
    doc_opstamps: Vec<u64>,
    tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
 }

+/// Builds the `FastFieldsWriter` used for fieldnorms from the schema's indexed fields.
+fn create_fieldnorms_writer(schema: &Schema) -> FastFieldsWriter {
+    let mut indexed_fields: Vec<Field> = Vec::new();
+    for (field_id, field_entry) in schema.fields().iter().enumerate() {
+        if field_entry.is_indexed() {
+            indexed_fields.push(Field(field_id as u32));
+        }
+    }
+    FastFieldsWriter::new(indexed_fields)
+}

 impl<'a> SegmentWriter<'a> {
    /// Creates a new `SegmentWriter`
@@ -72,7 +83,7 @@ impl<'a> SegmentWriter<'a> {
            heap,
            max_doc: 0,
            multifield_postings,
-            fieldnorms_writer: FieldNormsWriter::for_schema(schema),
+            fieldnorms_writer: create_fieldnorms_writer(schema),
            segment_serializer,
            fast_field_writers: FastFieldsWriter::from_schema(schema),
            doc_opstamps: Vec::with_capacity(1_000),
@@ -84,8 +95,7 @@ impl<'a> SegmentWriter<'a> {
    ///
    /// Finalize consumes the `SegmentWriter`, so that it cannot
    /// be used afterwards.
-    pub fn finalize(mut self) -> Result<Vec<u64>> {
-        self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
+    pub fn finalize(self) -> Result<Vec<u64>> {
        write(
            &self.multifield_postings,
            &self.fast_field_writers,
@@ -180,7 +190,10 @@ impl<'a> SegmentWriter<'a> {
                        0
                    };
                    self.fieldnorms_writer
-                        .record(doc_id, field, num_tokens);
+                        .get_field_writer(field)
+                        .map(|field_norms_writer| {
+                            field_norms_writer.add_val(u64::from(num_tokens))
+                        });
                }
                FieldType::U64(ref int_option) => {
                    if int_option.is_indexed() {
@@ -206,6 +219,7 @@ impl<'a> SegmentWriter<'a> {
                }
            }
        }
+        self.fieldnorms_writer.fill_val_up_to(doc_id);
        doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
        let doc_writer = self.segment_serializer.get_store_writer();
        doc_writer.store(&doc)?;
@@ -238,13 +252,14 @@ impl<'a> SegmentWriter<'a> {
 fn write(
    multifield_postings: &MultiFieldPostingsWriter,
    fast_field_writers: &FastFieldsWriter,
-    fieldnorms_writer: &FieldNormsWriter,
+    fieldnorms_writer: &FastFieldsWriter,
    mut serializer: SegmentSerializer,
 ) -> Result<()> {
    let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
    fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
-    fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
+    fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer(), &HashMap::new())?;
    serializer.close()?;
+
    Ok(())
 }

--- a/src/indexer/stamper.rs
+++ b/src/indexer/stamper.rs
@@ -1,15 +1,15 @@
-use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::Arc;

 #[derive(Clone, Default)]
-pub struct Stamper(Arc<AtomicUsize>);
+pub struct Stamper(Arc<AtomicU64>);

 impl Stamper {
    pub fn new(first_opstamp: u64) -> Stamper {
-        Stamper(Arc::new(AtomicUsize::new(first_opstamp as usize)))
+        Stamper(Arc::new(AtomicU64::new(first_opstamp)))
    }

    pub fn stamp(&self) -> u64 {
-        self.0.fetch_add(1, Ordering::SeqCst) as u64
+        self.0.fetch_add(1u64, Ordering::SeqCst)
    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -129,7 +129,6 @@ extern crate log;
 #[macro_use]
 extern crate error_chain;

-#[cfg(feature="mmap")]
 extern crate atomicwrites;
 extern crate bit_set;
 extern crate byteorder;
@@ -140,7 +139,7 @@ extern crate fst;
 extern crate futures;
 extern crate futures_cpupool;
 extern crate itertools;
-extern crate snap;
+extern crate lz4;
 extern crate num_cpus;
 extern crate owning_ref;
 extern crate regex;
@@ -149,10 +148,9 @@ extern crate serde;
 extern crate serde_json;
 extern crate stable_deref_trait;
 extern crate tempdir;
-#[cfg(test)]
 extern crate tempfile;
+extern crate time;
 extern crate uuid;
-extern crate bitpacking;

 #[cfg(test)]
 #[macro_use]
@@ -161,6 +159,9 @@ extern crate matches;
 #[cfg(test)]
 extern crate env_logger;

+#[cfg(feature = "simdcompression")]
+extern crate libc;
+
 #[cfg(windows)]
 extern crate winapi;

@@ -203,7 +204,6 @@ pub mod collector;
 pub mod postings;
 pub mod schema;
 pub mod fastfield;
-pub mod fieldnorm;

 mod docset;
 pub use self::docset::{DocSet, SkipResult};
@@ -213,6 +213,8 @@ pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
 pub use indexer::IndexWriter;
 pub use schema::{Document, Term};
 pub use core::{InvertedIndexReader, SegmentReader};
+pub use self::common::TimerTree;
+
 pub use postings::Postings;
 pub use core::SegmentComponent;

@@ -292,14 +294,6 @@ mod tests {
    use rand::{Rng, SeedableRng, XorShiftRng};
    use rand::distributions::{IndependentSample, Range};

-    pub fn assert_nearly_equals(expected: f32, val: f32) {
-        assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
-    }
-
-    pub fn nearly_equals(a: f32, b: f32) -> bool {
-        (a - b).abs() < 0.0005 * (a + b).abs()
-    }
-
    fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
        let seed: &[u32; 4] = &[1, 2, 3, seed_val];
        let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
@@ -333,7 +327,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap")]
    fn test_indexing() {
        let mut schema_builder = SchemaBuilder::default();
        let text_field = schema_builder.add_text_field("text", TEXT);
@@ -398,35 +391,6 @@ mod tests {
        }
    }

-    #[test]
-    fn test_fieldnorm_no_docs_with_field() {
-        let mut schema_builder = SchemaBuilder::default();
-        let title_field = schema_builder.add_text_field("title", TEXT);
-        let text_field = schema_builder.add_text_field("text", TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            {
-                let doc = doc!(text_field=>"a b c");
-                index_writer.add_document(doc);
-            }
-            index_writer.commit().unwrap();
-        }
-        {
-            index.load_searchers().unwrap();
-            let searcher = index.searcher();
-            let reader = searcher.segment_reader(0);
-            {
-                let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
-                assert_eq!(fieldnorm_reader.fieldnorm(0), 3);
-            }
-            {
-                let fieldnorm_reader = reader.get_fieldnorms_reader(title_field);
-                assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0);
-            }
-        }
-    }
-
    #[test]
    fn test_fieldnorm() {
        let mut schema_builder = SchemaBuilder::default();
@@ -452,23 +416,13 @@ mod tests {
            index.load_searchers().unwrap();
            let searcher = index.searcher();
            let segment_reader: &SegmentReader = searcher.segment_reader(0);
-            let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
-            assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
-            assert_eq!(fieldnorms_reader.fieldnorm(1), 0);
-            assert_eq!(fieldnorms_reader.fieldnorm(2), 2);
+            let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap();
+            assert_eq!(fieldnorms_reader.get(0), 3);
+            assert_eq!(fieldnorms_reader.get(1), 0);
+            assert_eq!(fieldnorms_reader.get(2), 2);
        }
    }

-
-    fn advance_undeleted(docset: &mut DocSet, reader: &SegmentReader) -> bool {
-        while docset.advance() {
-            if !reader.is_deleted(docset.doc()) {
-                return true;
-            }
-        }
-        false
-    }
-
    #[test]
    fn test_delete_postings1() {
        let mut schema_builder = SchemaBuilder::default();
@@ -534,19 +488,19 @@ mod tests {
                let mut postings = inverted_index
                    .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                    .unwrap();
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 5);
-                assert!(!advance_undeleted(&mut postings, reader));
+                assert!(!postings.advance());
            }
            {
                let mut postings = inverted_index
                    .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
                    .unwrap();
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 3);
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 4);
-                assert!(!advance_undeleted(&mut postings, reader));
+                assert!(!postings.advance());
            }
        }
        {
@@ -578,19 +532,19 @@ mod tests {
                let mut postings = inverted_index
                    .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                    .unwrap();
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 5);
-                assert!(!advance_undeleted(&mut postings, reader));
+                assert!(!postings.advance());
            }
            {
                let mut postings = inverted_index
                    .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
                    .unwrap();
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 3);
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 4);
-                assert!(!advance_undeleted(&mut postings, reader));
+                assert!(!postings.advance());
            }
        }
        {
@@ -621,25 +575,25 @@ mod tests {
                let mut postings = inverted_index
                    .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                    .unwrap();
-                assert!(!advance_undeleted(&mut postings, reader));
+                assert!(!postings.advance());
            }
            {
                let mut postings = inverted_index
                    .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
                    .unwrap();
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 3);
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 4);
-                assert!(!advance_undeleted(&mut postings, reader));
+                assert!(!postings.advance());
            }
            {
                let mut postings = inverted_index
                    .read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
                    .unwrap();
-                assert!(advance_undeleted(&mut postings, reader));
+                assert!(postings.advance());
                assert_eq!(postings.doc(), 4);
-                assert!(!advance_undeleted(&mut postings, reader));
+                assert!(!postings.advance());
            }
        }
    }
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -51,12 +51,12 @@ pub mod tests {
    use schema::IndexRecordOption;
    use std::iter;
    use datastruct::stacker::Heap;
+    use query::TermQuery;
    use schema::Field;
    use test::{self, Bencher};
    use indexer::operation::AddOperation;
    use tests;
    use rand::{Rng, SeedableRng, XorShiftRng};
-    use fieldnorm::FieldNormReader;

    #[test]
    pub fn test_position_write() {
@@ -67,12 +67,12 @@ pub mod tests {
        let mut segment = index.new_segment();
        let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
        {
-            let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap();
+            let mut field_serializer = posting_serializer.new_field(text_field).unwrap();
            field_serializer.new_term("abc".as_bytes()).unwrap();
            for doc_id in 0u32..120u32 {
                let delta_positions = vec![1, 2, 3, 2];
                field_serializer
-                    .write_doc(doc_id, 4, &delta_positions)
+                    .write_doc(doc_id, 2, &delta_positions)
                    .unwrap();
            }
            field_serializer.close_term().unwrap();
@@ -97,69 +97,58 @@ pub mod tests {
        index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
        index_writer.commit().unwrap();
        index.load_searchers().unwrap();
-
        let searcher = index.searcher();
-        let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
-        let term = Term::from_field_text(title, "abc");
-
-        let mut positions = Vec::new();
-
+        let query = TermQuery::new(
+            Term::from_field_text(title, "abc"),
+            IndexRecordOption::WithFreqsAndPositions,
+        );
+        let weight = query.specialized_weight(&*searcher, true);
        {
-            let mut postings = inverted_index
-                .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+            let mut scorer = weight
+                .specialized_scorer(searcher.segment_reader(0u32))
                .unwrap();
-            postings.advance();
-            postings.positions(&mut positions);
-            assert_eq!(&[0, 1, 2], &positions[..]);
-            postings.positions(&mut positions);
-            assert_eq!(&[0, 1, 2], &positions[..]);
-            postings.advance();
-            postings.positions(&mut positions);
-            assert_eq!(&[0, 5], &positions[..]);
+            scorer.advance();
+            assert_eq!(&[0, 1, 2], scorer.postings().positions());
+            scorer.advance();
+            assert_eq!(&[0, 5], scorer.postings().positions());
        }
        {
-            let mut postings = inverted_index
-                .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+            let mut scorer = weight
+                .specialized_scorer(searcher.segment_reader(0u32))
                .unwrap();
-            postings.advance();
-            postings.advance();
-            postings.positions(&mut positions);
-            assert_eq!(&[0, 5], &positions[..]);
+            scorer.advance();
+            scorer.advance();
+            assert_eq!(&[0, 5], scorer.postings().positions());
        }
        {
-
-            let mut postings = inverted_index
-                .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+            let mut scorer = weight
+                .specialized_scorer(searcher.segment_reader(0u32))
                .unwrap();
-            assert_eq!(postings.skip_next(1), SkipResult::Reached);
-            assert_eq!(postings.doc(), 1);
-            postings.positions(&mut positions);
-            assert_eq!(&[0, 5], &positions[..]);
+            assert_eq!(scorer.skip_next(1), SkipResult::Reached);
+            assert_eq!(scorer.doc(), 1);
+            assert_eq!(&[0, 5], scorer.postings().positions());
        }
        {
-            let mut postings = inverted_index
-                .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+            let mut scorer = weight
+                .specialized_scorer(searcher.segment_reader(0u32))
                .unwrap();
-            assert_eq!(postings.skip_next(1002), SkipResult::Reached);
-            assert_eq!(postings.doc(), 1002);
-            postings.positions(&mut positions);
-            assert_eq!(&[0, 5], &positions[..]);
+            assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
+            assert_eq!(scorer.doc(), 1002);
+            assert_eq!(&[0, 5], scorer.postings().positions());
        }
        {
-            let mut postings = inverted_index
-                .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+            let mut scorer = weight
+                .specialized_scorer(searcher.segment_reader(0u32))
                .unwrap();
-            assert_eq!(postings.skip_next(100), SkipResult::Reached);
-            assert_eq!(postings.skip_next(1002), SkipResult::Reached);
-            assert_eq!(postings.doc(), 1002);
-            postings.positions(&mut positions);
-            assert_eq!(&[0, 5], &positions[..]);
+            assert_eq!(scorer.skip_next(100), SkipResult::Reached);
+            assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
+            assert_eq!(scorer.doc(), 1002);
+            assert_eq!(&[0, 5], scorer.postings().positions());
        }
    }

    #[test]
    pub fn test_position_and_fieldnorm1() {
-        let mut positions = Vec::new();
        let mut schema_builder = SchemaBuilder::default();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
@@ -206,13 +195,11 @@ pub mod tests {
        {
            let segment_reader = SegmentReader::open(&segment).unwrap();
            {
-                let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field) ;
-                assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
-                assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
+                let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap();
+                assert_eq!(fieldnorm_reader.get(0), 8 + 5);
+                assert_eq!(fieldnorm_reader.get(1), 2);
                for i in 2..1000 {
-                    assert_eq!(
-                        fieldnorm_reader.fieldnorm_id(i),
-                        FieldNormReader::fieldnorm_to_id(i + 1) );
+                    assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64);
                }
            }
            {
@@ -234,16 +221,15 @@ pub mod tests {
                assert!(postings_a.advance());
                assert_eq!(postings_a.doc(), 0);
                assert_eq!(postings_a.term_freq(), 6);
-                postings_a.positions(&mut positions);
-                assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]);
+                assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
+                assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
                assert!(postings_a.advance());
                assert_eq!(postings_a.doc(), 1u32);
                assert_eq!(postings_a.term_freq(), 1);
                for i in 2u32..1000u32 {
                    assert!(postings_a.advance());
                    assert_eq!(postings_a.term_freq(), 1);
-                    postings_a.positions(&mut positions);
-                    assert_eq!(&positions[..], [i]);
+                    assert_eq!(postings_a.positions(), [i]);
                    assert_eq!(postings_a.doc(), i);
                }
                assert!(!postings_a.advance());
@@ -258,7 +244,7 @@ pub mod tests {
                for i in 2u32..1000u32 {
                    assert!(postings_e.advance());
                    assert_eq!(postings_e.term_freq(), i);
-                    postings_e.positions(&mut positions);
+                    let positions = postings_e.positions();
                    assert_eq!(positions.len(), i as usize);
                    for j in 0..positions.len() {
                        assert_eq!(positions[j], (j as u32));
@@ -272,7 +258,6 @@ pub mod tests {

    #[test]
    pub fn test_position_and_fieldnorm2() {
-        let mut positions: Vec<u32> = Vec::new();
        let mut schema_builder = SchemaBuilder::default();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
@@ -292,17 +277,18 @@ pub mod tests {
            assert!(index_writer.commit().is_ok());
        }
        index.load_searchers().unwrap();
-        let term_a = Term::from_field_text(text_field, "a");
+        let term_query = TermQuery::new(
+            Term::from_field_text(text_field, "a"),
+            IndexRecordOption::Basic,
+        );
        let searcher = index.searcher();
-        let segment_reader = searcher.segment_reader(0);
-        let mut postings = segment_reader
-            .inverted_index(text_field)
-            .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
-            .unwrap();
-        assert!(postings.advance());
-        assert_eq!(postings.doc(), 1u32);
-        postings.positions(&mut positions);
-        assert_eq!(&positions[..], &[1u32, 4]);
+        let mut term_weight = term_query.specialized_weight(&*searcher, true);
+        term_weight.index_record_option = IndexRecordOption::WithFreqsAndPositions;
+        let segment_reader = &searcher.segment_readers()[0];
+        let mut term_scorer = term_weight.specialized_scorer(segment_reader).unwrap();
+        assert!(term_scorer.advance());
+        assert_eq!(term_scorer.doc(), 1u32);
+        assert_eq!(term_scorer.postings().positions(), &[1u32, 4]);
    }

    #[test]
@@ -400,9 +386,11 @@ pub mod tests {
        {
            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            index_writer.delete_term(term_0);
+
            assert!(index_writer.commit().is_ok());
        }
        index.load_searchers().unwrap();
+
        let searcher = index.searcher();
        let segment_reader = searcher.segment_reader(0);

@@ -414,9 +402,8 @@ pub mod tests {
                .unwrap();

            if i % 2 == 0 {
-                assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
-                assert_eq!(segment_postings.doc(), i);
-                assert!(segment_reader.is_deleted(i));
+                assert_eq!(segment_postings.skip_next(i), SkipResult::OverStep);
+                assert_eq!(segment_postings.doc(), i + 1);
            } else {
                assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
                assert_eq!(segment_postings.doc(), i);
@@ -443,13 +430,14 @@ pub mod tests {
                last = cur;
                cur = next;
            }
+
            assert_eq!(cur, 377);
        }

        // delete everything else
        {
            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-                index_writer.delete_term(term_1);
+            index_writer.delete_term(term_1);

            assert!(index_writer.commit().is_ok());
        }
@@ -465,9 +453,7 @@ pub mod tests {
                .read_postings(&term_2, IndexRecordOption::Basic)
                .unwrap();

-            assert_eq!(segment_postings.skip_next(0), SkipResult::Reached);
-            assert_eq!(segment_postings.doc(), 0);
-            assert!(segment_reader.is_deleted(0));
+            assert_eq!(segment_postings.skip_next(0), SkipResult::End);

            let mut segment_postings = segment_reader
                .inverted_index(term_2.field())
@@ -565,7 +551,7 @@ pub mod tests {
                .inverted_index(TERM_D.field())
                .read_postings(&*TERM_D, IndexRecordOption::Basic)
                .unwrap();
-            let mut intersection = Intersection::new(vec![
+            let mut intersection = Intersection::from(vec![
                segment_postings_a,
                segment_postings_b,
                segment_postings_c,
--- a/src/postings/postings.rs
+++ b/src/postings/postings.rs
@@ -10,17 +10,10 @@ use docset::DocSet;
 /// Its main implementation is `SegmentPostings`,
 /// but other implementations mocking `SegmentPostings` exist,
 /// for merging segments or for testing.
-pub trait Postings: DocSet + 'static {
+pub trait Postings: DocSet {
    /// Returns the term frequency
    fn term_freq(&self) -> u32;
-
-    /// Returns the positions offseted with a given value.
-    /// The output vector will be resized to the `term_freq`.
-    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
-
-    /// Returns the positions of the term in the given document.
-    /// The output vector will be resized to the `term_freq`.
-    fn positions(&mut self, output: &mut Vec<u32>) {
-        self.positions_with_offset(0u32, output);
-    }
+    /// Returns the list of positions of the term, expressed as a list of
+    /// token ordinals.
+    fn positions(&self) -> &[u32];
 }
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -16,6 +16,7 @@ use tokenizer::Token;
 use tokenizer::TokenStream;
 use schema::IndexRecordOption;
 use postings::UnorderedTermId;
+use termdict::TermOrdinal;

 fn posting_from_field_entry<'a>(
    field_entry: &FieldEntry,
@@ -44,6 +45,7 @@ fn posting_from_field_entry<'a>(

 pub struct MultiFieldPostingsWriter<'a> {
    heap: &'a Heap,
+    schema: Schema,
    term_index: TermHashMap<'a>,
    per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
 }
@@ -58,8 +60,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
            .iter()
            .map(|field_entry| posting_from_field_entry(field_entry, heap))
            .collect();
-
        MultiFieldPostingsWriter {
+            schema: schema.clone(),
            heap,
            term_index,
            per_field_postings_writers,
@@ -83,7 +85,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
    pub fn serialize(
        &self,
        serializer: &mut InvertedIndexSerializer,
-    ) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
+    ) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
        let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
        term_offsets.sort_by_key(|&(k, _, _)| k);

@@ -94,7 +96,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
            .map(|(key, _, _)| Term::wrap(key).field())
            .enumerate();

-        let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
+        let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
            HashMap::new();

        let mut prev_field = Field(u32::max_value());
@@ -110,20 +112,26 @@ impl<'a> MultiFieldPostingsWriter<'a> {
            let (field, start) = offsets[i];
            let (_, stop) = offsets[i + 1];

-            // populating the unordered term ord -> ordered term ord mapping
-            // for the field.
-            let mut mapping = HashMap::new();
-            for (term_ord, term_unord_id) in term_offsets[start..stop]
-                .iter()
-                .map(|&(_, _, bucket)| bucket)
-                .enumerate()
-            {
-                mapping.insert(term_unord_id, term_ord);
+            let field_entry = self.schema.get_field_entry(field);
+
+            match field_entry.field_type() {
+                FieldType::Str(_) | FieldType::HierarchicalFacet => {
+                    // populating the (unordered term ord) -> (ordered term ord) mapping
+                    // for the field.
+                    let mut unordered_term_ids = term_offsets[start..stop]
+                        .iter()
+                        .map(|&(_, _, bucket)| bucket);
+                    let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
+                        .enumerate()
+                        .map(|(term_ord, unord_term_id)| (unord_term_id as UnorderedTermId, term_ord as TermOrdinal))
+                        .collect();
+                    unordered_term_mappings.insert(field, mapping);
+                }
+                FieldType::U64(_) | FieldType::I64(_) => {}
            }
-            unordered_term_mappings.insert(field, mapping);

            let postings_writer = &self.per_field_postings_writers[field.0 as usize];
-            let mut field_serializer = serializer.new_field(field, postings_writer.total_num_tokens())?;
+            let mut field_serializer = serializer.new_field(field)?;
            postings_writer.serialize(
                &term_offsets[start..stop],
                &mut field_serializer,
@@ -181,24 +189,18 @@ pub trait PostingsWriter {
    ) -> u32 {
        let mut term = unsafe { Term::with_capacity(100) };
        term.set_field(field);
-        let num_tokens = {
-            let mut sink = |token: &Token| {
-                term.set_text(token.text.as_str());
-                self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
-            };
-            token_stream.process(&mut sink)
+        let mut sink = |token: &Token| {
+            term.set_text(token.text.as_str());
+            self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
        };
-        num_tokens
+        token_stream.process(&mut sink)
    }
-
-    fn total_num_tokens(&self) -> u64;
 }

 /// The `SpecializedPostingsWriter` is just here to remove dynamic
 /// dispatch to the recorder information.
 pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
    heap: &'a Heap,
-    total_num_tokens: u64,
    _recorder_type: PhantomData<Rec>,
 }

@@ -207,7 +209,6 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
    pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
        SpecializedPostingsWriter {
            heap,
-            total_num_tokens: 0u64,
            _recorder_type: PhantomData,
        }
    }
@@ -236,7 +237,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
            }
            recorder.new_doc(doc, heap);
        }
-        self.total_num_tokens += 1;
        recorder.record_position(position, heap);
        term_ord
    }
@@ -255,8 +255,4 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
        }
        Ok(())
    }
-
-    fn total_num_tokens(&self) -> u64 {
-        self.total_num_tokens
-    }
 }
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -5,12 +5,16 @@ use common::BitSet;
 use common::HasLen;
 use postings::Postings;
 use docset::{DocSet, SkipResult};
+use std::cmp;
 use fst::Streamer;
 use compression::compressed_block_size;
+use fastfield::DeleteBitSet;
+use std::cell::UnsafeCell;
 use directory::{ReadOnlySource, SourceRead};
 use postings::FreqReadingOption;
 use postings::serializer::PostingsSerializer;
-use common::CountingWriter;
+
+const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];

 struct PositionComputer {
    // store the amount of position int
@@ -18,32 +22,42 @@ struct PositionComputer {
    //
    // if `None`, the positions are already loaded in
    // the positions vec.
-    position_to_skip: usize,
+    position_to_skip: Option<usize>,
+    positions: Vec<u32>,
    positions_stream: CompressedIntStream,
 }

 impl PositionComputer {
    pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
        PositionComputer {
-            position_to_skip: 0,
+            position_to_skip: None,
+            positions: vec![],
            positions_stream,
        }
    }

    pub fn add_skip(&mut self, num_skip: usize) {
-        self.position_to_skip += num_skip;
+        self.position_to_skip = Some(
+            self.position_to_skip
+                .map(|prev_skip| prev_skip + num_skip)
+                .unwrap_or(0),
+        );
    }

-    // Positions can only be read once.
-    pub fn positions_with_offset(&mut self, offset: u32, output: &mut [u32]) {
-        self.positions_stream.skip(self.position_to_skip);
-        self.position_to_skip = 0;
-        self.positions_stream.read(output);
-        let mut cum = offset;
-        for output_mut in output.iter_mut() {
-            cum += *output_mut;
-            *output_mut = cum;
+    pub fn positions(&mut self, term_freq: usize) -> &[u32] {
+        if let Some(num_skip) = self.position_to_skip {
+            self.positions.resize(term_freq, 0u32);
+            self.positions_stream.skip(num_skip);
+            self.positions_stream.read(&mut self.positions[..term_freq]);
+
+            let mut cum = 0u32;
+            for i in 0..term_freq as usize {
+                cum += self.positions[i];
+                self.positions[i] = cum;
+            }
+            self.position_to_skip = None;
        }
+        &self.positions[..term_freq]
    }
 }

@@ -55,20 +69,11 @@ impl PositionComputer {
 pub struct SegmentPostings {
    block_cursor: BlockSegmentPostings,
    cur: usize,
-    position_computer: Option<PositionComputer>,
+    delete_bitset: DeleteBitSet,
+    position_computer: Option<UnsafeCell<PositionComputer>>,
 }

 impl SegmentPostings {
-    /// Returns an empty segment postings object
-    pub fn empty() -> Self {
-        let empty_block_cursor = BlockSegmentPostings::empty();
-        SegmentPostings {
-            block_cursor: empty_block_cursor,
-            cur: COMPRESSION_BLOCK_SIZE,
-            position_computer: None,
-        }
-    }
-
    /// Creates a segment postings object with the given documents
    /// and no frequency encoded.
    ///
@@ -78,26 +83,22 @@ impl SegmentPostings {
    /// and returns a `SegmentPostings` object that embeds a
    /// buffer with the serialized data.
    pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
-        let mut counting_writer = CountingWriter::wrap(Vec::new());
+        let mut buffer = Vec::new();
        {
-            let mut postings_serializer = PostingsSerializer::new(&mut counting_writer, false);
+            let mut postings_serializer = PostingsSerializer::new(&mut buffer, false);
            for &doc in docs {
                postings_serializer.write_doc(doc, 1u32).unwrap();
            }
-            postings_serializer.close_term().expect("In memory Serialization should never fail.");
+            postings_serializer.close_term().unwrap();
        }
-        let (buffer , _) = counting_writer.finish().expect("Serializing in a buffer should never fail.");
        let data = ReadOnlySource::from(buffer);
        let block_segment_postings = BlockSegmentPostings::from_data(
            docs.len(),
            SourceRead::from(data),
            FreqReadingOption::NoFreq,
        );
-        SegmentPostings::from_block_postings(block_segment_postings, None)
+        SegmentPostings::from_block_postings(block_segment_postings, DeleteBitSet::empty(), None)
    }
-}
-
-impl SegmentPostings {

    /// Reads a Segment postings from an &[u8]
    ///
@@ -107,42 +108,65 @@ impl SegmentPostings {
    ///   frequencies and/or positions
    pub fn from_block_postings(
        segment_block_postings: BlockSegmentPostings,
+        delete_bitset: DeleteBitSet,
        positions_stream_opt: Option<CompressedIntStream>,
    ) -> SegmentPostings {
+        let position_computer =
+            positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
        SegmentPostings {
            block_cursor: segment_block_postings,
            cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
-            position_computer: positions_stream_opt.map(PositionComputer::new),
+            delete_bitset,
+            position_computer,
        }
    }
-}

-fn exponential_search(target: u32, mut start: usize, arr: &[u32]) -> (usize, usize) {
-    let end = arr.len();
-    debug_assert!(target >= arr[start]);
-    debug_assert!(target <= arr[end - 1]);
-    let mut jump = 1;
-    loop {
-        let new = start + jump;
-        if new >= end {
-            return (start, end)
+    /// Returns an empty segment postings object
+    pub fn empty() -> SegmentPostings {
+        let empty_block_cursor = BlockSegmentPostings::empty();
+        SegmentPostings {
+            block_cursor: empty_block_cursor,
+            delete_bitset: DeleteBitSet::empty(),
+            cur: COMPRESSION_BLOCK_SIZE,
+            position_computer: None,
        }
-        if arr[new] > target {
-            return (start, new);
+    }
+
+    fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
+        if let Some(position_computer) = self.position_computer.as_ref() {
+            let num_skips = num_skips_fn();
+            unsafe {
+                (*position_computer.get()).add_skip(num_skips);
+            }
        }
-        start = new;
-        jump *= 2;
    }
 }

 impl DocSet for SegmentPostings {
+    // Goes to the next element.
+    // `advance` needs to be called a first time to point to the first element.
+    #[inline]
+    fn advance(&mut self) -> bool {
+        loop {
+            self.position_add_skip(|| self.term_freq() as usize);
+            self.cur += 1;
+            if self.cur >= self.block_cursor.block_len() {
+                self.cur = 0;
+                if !self.block_cursor.advance() {
+                    self.cur = COMPRESSION_BLOCK_SIZE;
+                    return false;
+                }
+            }
+            if !self.delete_bitset.is_deleted(self.doc()) {
+                return true;
+            }
+        }
+    }
+
    fn skip_next(&mut self, target: DocId) -> SkipResult {
        if !self.advance() {
            return SkipResult::End;
        }
-        if self.doc() == target {
-            return SkipResult::Reached;
-        }

        // in the following, thanks to the call to advance above,
        // we know that the position is not loaded and we need
@@ -160,16 +184,17 @@ impl DocSet for SegmentPostings {
                // so that position_add_skip will decide if it should
                // just set itself to Some(0) or actually
                // add the term freq.
-                if self.position_computer.is_some() {
+                //let num_skips: u32 = ;
+                self.position_add_skip(|| {
                    let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
-                    let sum_freq: u32 = freqs_skipped.iter().sum();
-                    self.position_computer.as_mut()
-                        .unwrap()
-                        .add_skip(sum_freq as usize);
-                }
+                    let sum_freq: u32 = freqs_skipped.iter().cloned().sum();
+                    sum_freq as usize
+                });
+
                if !self.block_cursor.advance() {
                    return SkipResult::End;
                }
+
                self.cur = 0;
            } else {
                if target < current_doc {
@@ -181,54 +206,66 @@ impl DocSet for SegmentPostings {
                break;
            }
        }
+        {
+            // we're in the right block now, start with an exponential search
+            let block_docs = self.block_cursor.docs();
+            let block_len = block_docs.len();

-        // we're in the right block now, start with an exponential search
-        let block_docs = self.block_cursor.docs();
+            debug_assert!(target >= block_docs[self.cur]);
+            debug_assert!(target <= block_docs[block_len - 1]);

-        let (mut start, end) = exponential_search(target, self.cur, block_docs);
+            let mut start = self.cur;
+            let mut end = block_len;
+            let mut count = 1;
+            loop {
+                let new = start + count;
+                if new < end && block_docs[new] < target {
+                    start = new;
+                    count *= 2;
+                } else {
+                    break;
+                }
+            }
+            end = cmp::min(start + count, end);

-        start += block_docs[start..end]
-            .binary_search(&target)
-            .unwrap_or_else(|e| e);
+            // now do a binary search
+            let mut count = end - start;
+            while count > 0 {
+                let step = count / 2;
+                let mid = start + step;
+                let doc = block_docs[mid];
+                if doc < target {
+                    start = mid + 1;
+                    count -= step + 1;
+                } else {
+                    count = step;
+                }
+            }

-        // `doc` is now the first element >= `target`
-        let doc = block_docs[start];
-        debug_assert!(doc >= target);
+            // `doc` is now >= `target`
+            let doc = block_docs[start];

-        if self.position_computer.is_some() {
-            let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
-            let sum_freqs: u32 = freqs_skipped.iter().sum();
-            self.position_computer.as_mut()
-                .unwrap()
-                .add_skip(sum_freqs as usize);
-        }
+            self.position_add_skip(|| {
+                let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
+                let sum_freqs: u32 = freqs_skipped.iter().sum();
+                sum_freqs as usize
+            });

-        self.cur = start;
-        if doc == target {
-            return SkipResult::Reached;
-        } else {
-            return SkipResult::OverStep;
-        }
-    }
+            self.cur = start;

-
-    // goes to the next element.
-    // next needs to be called a first time to point to the correct element.
-    #[inline]
-    fn advance(&mut self) -> bool {
-        if self.position_computer.is_some() {
-            let term_freq = self.term_freq() as usize;
-            self.position_computer.as_mut().unwrap().add_skip(term_freq);
-        }
-        self.cur += 1;
-        if self.cur >= self.block_cursor.block_len() {
-            self.cur = 0;
-            if !self.block_cursor.advance() {
-                self.cur = COMPRESSION_BLOCK_SIZE;
-                return false;
+            if !self.delete_bitset.is_deleted(doc) {
+                if doc == target {
+                    return SkipResult::Reached;
+                } else {
+                    return SkipResult::OverStep;
+                }
            }
        }
-        true
+        if self.advance() {
+            SkipResult::OverStep
+        } else {
+            SkipResult::End
+        }
    }

    fn size_hint(&self) -> u32 {
@@ -262,7 +299,6 @@ impl DocSet for SegmentPostings {
    }
 }

-
 impl HasLen for SegmentPostings {
    fn len(&self) -> usize {
        self.block_cursor.doc_freq()
@@ -274,21 +310,14 @@ impl Postings for SegmentPostings {
        self.block_cursor.freq(self.cur)
    }

-    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
-        if self.position_computer.is_some() {
-            let prev_capacity = output.capacity();
-            let term_freq = self.term_freq() as usize;
-            if term_freq > prev_capacity {
-                let additional_len = term_freq - output.len();
-                output.reserve(additional_len);
-            }
-            unsafe {
-                output.set_len(term_freq);
-                self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
-            }
-        } else {
-            output.clear();
-        }
+    fn positions(&self) -> &[u32] {
+        let term_freq = self.term_freq();
+        self.position_computer
+            .as_ref()
+            .map(|position_computer| unsafe {
+                (&mut *position_computer.get()).positions(term_freq as usize)
+            })
+            .unwrap_or(&EMPTY_POSITIONS[..])
    }
 }

@@ -570,4 +599,3 @@ mod tests {
        assert_eq!(block_segments.docs(), &[1, 3, 5]);
    }
 }
-
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -11,10 +11,9 @@ use DocId;
 use core::Segment;
 use std::io::{self, Write};
 use compression::VIntEncoder;
-use common::BinarySerializable;
 use common::CountingWriter;
 use common::CompositeWrite;
-use termdict::TermDictionaryBuilder;
+use termdict::{TermOrdinal, TermDictionaryBuilder};

 /// `PostingsSerializer` is in charge of serializing
 /// postings on disk, in the
@@ -85,11 +84,10 @@ impl InvertedIndexSerializer {
    /// a given field.
    ///
    /// Loads the indexing options for the given field.
-    pub fn new_field(&mut self, field: Field, total_num_tokens: u64) -> io::Result<FieldSerializer> {
+    pub fn new_field(&mut self, field: Field) -> io::Result<FieldSerializer> {
        let field_entry: &FieldEntry = self.schema.get_field_entry(field);
        let term_dictionary_write = self.terms_write.for_field(field);
        let postings_write = self.postings_write.for_field(field);
-        total_num_tokens.serialize(postings_write)?;
        let positions_write = self.positions_write.for_field(field);
        FieldSerializer::new(
            field_entry.field_type().clone(),
@@ -116,6 +114,7 @@ pub struct FieldSerializer<'a> {
    positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
    current_term_info: TermInfo,
    term_open: bool,
+    num_terms: TermOrdinal,
 }

 impl<'a> FieldSerializer<'a> {
@@ -125,7 +124,6 @@ impl<'a> FieldSerializer<'a> {
        postings_write: &'a mut CountingWriter<WritePtr>,
        positions_write: &'a mut CountingWriter<WritePtr>,
    ) -> io::Result<FieldSerializer<'a>> {
-
        let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
            FieldType::Str(ref text_options) => {
                if let Some(text_indexing_options) = text_options.get_indexing_options() {
@@ -155,6 +153,7 @@ impl<'a> FieldSerializer<'a> {
            positions_serializer_opt,
            current_term_info: TermInfo::default(),
            term_open: false,
+            num_terms: TermOrdinal::default(),
        })
    }

@@ -175,7 +174,7 @@ impl<'a> FieldSerializer<'a> {
    /// * term - the term. It needs to come after the previous term according
    ///   to the lexicographical order.
    /// * doc_freq - return the number of documents containing the term.
-    pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
+    pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
        assert!(
            !self.term_open,
            "Called new_term, while the previous term was not closed."
@@ -183,7 +182,10 @@ impl<'a> FieldSerializer<'a> {
        self.term_open = true;
        self.postings_serializer.clear();
        self.current_term_info = self.current_term_info();
-        self.term_dictionary_builder.insert_key(term)
+        self.term_dictionary_builder.insert_key(term)?;
+        let term_ordinal = self.num_terms;
+        self.num_terms += 1;
+        Ok(term_ordinal)
    }

    /// Serialize the information that a document contains the current term,
--- a/src/query/all_query.rs
+++ b/src/query/all_query.rs
@@ -7,6 +7,7 @@ use Result;
 use Score;
 use DocId;
 use core::Searcher;
+use fastfield::DeleteBitSet;

 /// Query that matches all of the documents.
 ///
@@ -26,28 +27,52 @@ pub struct AllWeight;
 impl Weight for AllWeight {
    fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
        Ok(box AllScorer {
-            started: false,
+            state: State::NotStarted,
            doc: 0u32,
            max_doc: reader.max_doc(),
+            deleted_bitset: reader.delete_bitset().clone()
        })
    }
 }

+enum State {
+    NotStarted,
+    Started,
+    Finished
+}
+
 /// Scorer associated to the `AllQuery` query.
 pub struct AllScorer {
-    started: bool,
+    state: State,
    doc: DocId,
    max_doc: DocId,
+    deleted_bitset: DeleteBitSet
 }

 impl DocSet for AllScorer {
    fn advance(&mut self) -> bool {
-        if self.started {
-            self.doc += 1u32;
-        } else {
-            self.started = true;
+        loop {
+            match self.state {
+                State::NotStarted => {
+                    self.state = State::Started;
+                    self.doc = 0;
+                }
+                State::Started => {
+                    self.doc += 1u32;
+                }
+                State::Finished => {
+                    return false;
+                }
+            }
+            if self.doc < self.max_doc {
+                if !self.deleted_bitset.is_deleted(self.doc) {
+                    return true;
+                }
+            } else {
+                self.state = State::Finished;
+                return false;
+            }
        }
-        self.doc < self.max_doc
    }

    fn doc(&self) -> DocId {
--- a/src/query/bitset/mod.rs
+++ b/src/query/bitset/mod.rs
@@ -244,7 +244,7 @@ mod tests {

    #[bench]
    fn bench_bitset_1pct_clone_iterate(b: &mut test::Bencher) {
-        let els = tests::sample(1_000_000u32, 0.01);
+        let els = tests::generate_nonunique_unsorted(1_000_000u32, 10_000);
        let mut bitset = BitSet::with_max_value(1_000_000);
        for el in els {
            bitset.insert(el);
--- a/src/query/bm25.rs
+++ b/src/query/bm25.rs
@@ -1,94 +0,0 @@
-use fieldnorm::FieldNormReader;
-use Term;
-use Searcher;
-use Score;
-
-const K1: f32 = 1.2;
-const B: f32 = 0.75;
-
-fn idf(doc_freq: u64, doc_count: u64) -> f32 {
-    let x = ((doc_count - doc_freq) as f32 + 0.5) / (doc_freq as f32 + 0.5);
-    (1f32 + x).ln()
-}
-
-
-fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 {
-    K1 * (1f32 - B + B * fieldnorm as f32 / average_fieldnorm)
-}
-
-fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] {
-    let mut cache = [0f32; 256];
-    for fieldnorm_id in 0..256 {
-        let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8);
-        cache[fieldnorm_id] = cached_tf_component(fieldnorm, average_fieldnorm);
-    }
-    cache
-}
-
-#[derive(Clone)]
-pub struct BM25Weight {
-    weight: f32,
-    cache: [f32; 256],
-}
-
-impl BM25Weight {
-
-    pub fn null() -> BM25Weight {
-        BM25Weight {
-            weight: 0f32,
-            cache: [1f32; 256]
-        }
-    }
-
-    pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
-        assert!(!terms.is_empty(), "BM25 requires at least one term");
-        let field = terms[0].field();
-        for term in &terms[1..] {
-            assert_eq!(term.field(), field, "All terms must belong to the same field.");
-        }
-
-        let mut total_num_tokens = 0u64;
-        let mut total_num_docs = 0u64;
-        for segment_reader in searcher.segment_readers() {
-            let inverted_index = segment_reader.inverted_index(field);
-            total_num_tokens += inverted_index.total_num_tokens();
-            total_num_docs += segment_reader.max_doc() as u64;
-        }
-        let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
-
-        let idf = terms.iter()
-            .map(|term| {
-                let term_doc_freq = searcher.doc_freq(term);
-                idf(term_doc_freq, total_num_docs)
-            })
-            .sum::<f32>();
-        BM25Weight::new(idf, average_fieldnorm)
-    }
-
-    fn new(idf: f32, average_fieldnorm: f32) -> BM25Weight {
-        BM25Weight {
-            weight: idf * (1f32 + K1),
-            cache: compute_tf_cache(average_fieldnorm),
-        }
-    }
-
-    #[inline(always)]
-    pub fn score(&self, fieldnorm_id: u8, term_freq: u32) -> Score {
-        let norm = self.cache[fieldnorm_id as usize];
-        let term_freq = term_freq as f32;
-        self.weight * term_freq / (term_freq + norm)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use tests::assert_nearly_equals;
-    use super::idf;
-
-    #[test]
-    fn test_idf() {
-        assert_nearly_equals(idf(1, 2),  0.6931472);
-    }
-
-}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Paul Masurel	d40ef06dde	Edited changelog and bumped version. This branch is to be published as a hotfix for 5.1.0. Closes #280 Closes #274 Closes #289	2018-05-05 21:00:10 -07:00
Paul Masurel	384917c17b	Added comments from code review.	2018-05-05 20:51:40 -07:00
Paul Masurel	cbca95aee3	Removed large block	2018-05-02 23:32:27 -07:00
Paul Masurel	2b8618afc2	Added unit test for multivalued u64 fastfields. u64 fastfields are not dictionary encoded.	2018-05-02 22:33:38 -07:00
Paul Masurel	967cf2cb02	AllQuery handling deletes, better tests	2018-05-02 11:42:07 -07:00
Paul Masurel	0e68c4ac34	Test passing.	2018-05-01 13:42:31 -07:00
Paul Masurel	e09192b0ab	issue-274 Added unit test testing if facet handle merging	2018-05-01 10:28:34 -07:00