Various changes. Need to cherrypick some of them and put them into master

NOBUG common crawl, streamdict works with 64 bits (hopefully)
2026-06-07 19:10:42 +00:00 · 2017-12-25 10:35:10 +09:00 · 2017-12-21 22:44:50 +09:00
185 changed files with 100494 additions and 8773 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-*.swp
 target
 target/debug
 .vscode
@@ -9,4 +8,4 @@ benchmark
 cpp/simdcomp/bitpackingbenchmark
 *.bk
 .idea
-trace.dat
+trace.dat
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,4 @@
 language: rust
-sudo: required
-cache: cargo
 rust:
  - nightly
 env:
@@ -13,7 +11,6 @@ addons:
  apt:
    sources:
      - ubuntu-toolchain-r-test
-      - kalakris-cmake
    packages:
      - gcc-4.8
      - g++-4.8
@@ -21,17 +18,18 @@ addons:
      - libelf-dev
      - libdw-dev
      - binutils-dev
-      - cmake
 before_script:
-  - export PATH=$HOME/.cargo/bin:$PATH
-  - cargo install cargo-update || echo "cargo-update already installed"
-  - cargo install cargo-travis || echo "cargo-travis already installed"
+  - |
+    pip install 'travis-cargo<0.2' --user &&
+    export PATH=$HOME/.local/bin:$PATH
 script:
-  - cargo build
-  - cargo test
-  - cargo test -- --ignored
+  - |
+    travis-cargo build &&
+    travis-cargo test &&
+    travis-cargo bench
  - cargo run --example simple_search
-  - cargo doc
 after_success:
-  - cargo coveralls --exclude-pattern src/functional_test.rs
-  - cargo doc-upload
+  - bash ./script/build-doc.sh
+  - travis-cargo doc-upload
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --include-path=`pwd`/src --exclude-path=`pwd`/cpp --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,27 +1,3 @@
-Tantivy 0.5.2
-==========================
-
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
-
-Tantivy 0.5.1
-==========================
- bugfix #254 : tantivy failed if no documents in a segment contained a specific field.
-
-
-Tantivy 0.5
-==========================
- Faceting
- RangeQuery
- Configurable tokenization pipeline
- Bugfix in PhraseQuery
- Various query optimisation
- Allowing very large indexes
-    - 64 bits file address
-    - Smarter encoding of the `TermInfo` objects
-
-

 Tantivy 0.4.3
 ==========================
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,8 @@
 [package]
 name = "tantivy"
-version = "0.5.1"
+version = "0.5.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
+build = "build.rs"
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Tantivy is a search engine library."""
@@ -13,22 +14,29 @@ keywords = ["search", "information", "retrieval"]

 [dependencies]
 byteorder = "1.0"
+memmap = "0.4"
 lazy_static = "0.2.1"
 tinysegmenter = "0.1.0"
 regex = "0.2"
-fst = {version="0.2", default-features=false}
-atomicwrites = {version="0.1", optional=true}
+fst = "0.1.37"
+atomicwrites = "0.1.3"
+tempfile = "2.1"
 log = "0.3.6"
 combine = "2.2"
 tempdir = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
+bincode = "0.8"
+libc = {version = "0.2.20", optional=true}
 num_cpus = "1.2"
 itertools = "0.5.9"
+lz4 = "1.20"
 bit-set = "0.4.0"
-uuid = { version = "0.6", features = ["v4", "serde"] }
+time = "0.1"
+uuid = { version = "0.5", features = ["v4", "serde"] }
 chan = "0.1"
+version = "2"
 crossbeam = "0.3"
 futures = "0.1"
 futures-cpupool = "0.1"
@@ -36,19 +44,17 @@ error-chain = "0.8"
 owning_ref = "0.3"
 stable_deref_trait = "1.0.0"
 rust-stemmers = "0.1.0"
-downcast = { version="0.9", features = ["nightly"]}
-matches = "0.1"
-snap = "0.2"
-bitpacking = {path = "../bitpacking"}

 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"

 [dev-dependencies]
 rand = "0.3"
-tempfile = "2.1"
 env_logger = "0.4"

+[build-dependencies]
+cc = {version = "1.0.0", optional=true}
+
 [profile.release]
 opt-level = 3
 debug = false
@@ -57,23 +63,10 @@ debug-assertions = false


 [features]
-default = ["mmap"]
+default = ["simdcompression"]
+simdcompression = ["libc", "cc"]
 streamdict = []
-mmap = ["fst/mmap", "atomicwrites"]


 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
-
-[[example]]
-name = "simple_search"
-required-features = ["mmap"]
-
-
-[[bin]]
-name = "convert_to_static"
-path = "./bin/convert_to_static.rs"
-
-[[bin]]
-name = "test_static_dir"
-path = "./bin/test_static_dir.rs"
--- a/README.md
+++ b/README.md
@@ -5,26 +5,25 @@
 [![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy)
+![beacon for google analytics](https://ga-beacon.appspot.com/UA-88834340-1/tantivy/README)

 **Tantivy** is a **full text search engine library** written in rust.

 It is strongly inspired by Lucene's design.

+
 # Features

- Tiny startup time (<10ms), perfect for command line tools
+- configurable indexing (optional term frequency and position indexing)
 - tf-idf scoring
 - Basic query language
 - Phrase queries
 - Incremental indexing
 - Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
+- mmap based
 - optional SIMD integer compression
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
+- u64 and i64 fast fields (equivalent of doc values in Lucene)
 - LZ4 compressed document store
- Range queries
- Faceting
- configurable indexing (optional term frequency and position indexing
 - Cheesy logo with a horse

 Tantivy supports Linux, MacOS and Windows.
@@ -41,38 +40,14 @@ It will walk you through getting a wikipedia search engine up and running in a f

 # Compiling

-## Development
-
-Tantivy requires Rust Nightly because it uses requires the features [`box_syntax`](https://doc.rust-lang.org/stable/unstable-book/language-features/box-syntax.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md),
-and [simd](https://github.com/rust-lang/rust/issues/27731).
-
-
-To check out and run test, you can simply run :
+Tantivy requires Rust Nightly because it requires the features [`box_syntax`](https://doc.rust-lang.org/stable/book/box-syntax-and-patterns.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), and [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md).
+The project can then be built using `cargo`.

    git clone git@github.com:tantivy-search/tantivy.git
    cd tantivy
-    cargo +nightly build
+    cargo build


-## Note on release build and performance
-
-If your project depends on `tantivy`, for better performance, make sure to enable
-`sse3` instructions using a RUSTFLAGS. (This instruction set is likely to
-be available on most `x86_64` CPUs you will encounter).
-
-For instance,
-
-    RUSTFLAGS='-C target-feature=+sse3'
-
-Or, if you are targetting a specific cpu
-
-    RUSTFLAGS='-C target-cpu=native' build --release
-
-Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
-If you want to disable those, you can run the following command :
-
-    cargo build --no-default-features
-
 Alternatively, if you are trying to compile `tantivy` without simd compression,
 you can disable this functionality. In this case, this submodule is not required
 and you can compile tantivy by using the `--no-default-features` flag.
@@ -82,4 +57,4 @@ and you can compile tantivy by using the `--no-default-features` flag.

 # Contribute

-Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
+Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
--- a/bin/convert_to_static.rs
+++ b/bin/convert_to_static.rs
@@ -1,20 +0,0 @@
-use std::env;
-use std::path::PathBuf;
-use std::fs::File;
-use std::io::Write;
-extern crate tantivy;
-use tantivy::directory::write_static_from_directory;
-
-fn main() {
-    // Prints each argument on a separate line
-    let  mut args = env::args();
-    args.next().unwrap();
-    let directory_path= args.next().expect("Expect 2 args.<directory_path> <outputfile>");
-    let output_path = args.next().expect("Expect 2 args.<directory_path> <outputfile>");
-    println!("{} => {}", directory_path, output_path);
-    let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
-    println!("Read all");
-    let mut output = File::create(output_path).unwrap();
-    output.write_all(&buffer[..]).unwrap();
-    output.flush().unwrap();
-}
--- a/bin/test_static_dir.rs
+++ b/bin/test_static_dir.rs
@@ -1,51 +0,0 @@
-use std::env;
-use std::path::PathBuf;
-use std::fs::File;
-use std::io::Write;
-extern crate tantivy;
-use tantivy::directory::{StaticDirectory, write_static_from_directory};
-use tantivy::Index;
-use tantivy::query::QueryParser;
-use tantivy::collector::TopCollector;
-
-
-static DATA: &'static [u8] = include_bytes!("output.bin");
-
-fn run() -> tantivy::Result<()> {
-    // Prints each argument on a separate line
-    let directory = StaticDirectory::open(DATA).unwrap();
-    let index = Index::open_directory(directory).unwrap();
-    index.load_searchers().unwrap();
-    let searcher = index.searcher();
-
-    let schema = index.schema();
-    let title = schema.get_field("title").unwrap();
-    let body = schema.get_field("body").unwrap();
-
-    let query_parser = QueryParser::for_index(&index, vec![title, body]);
-    let query = query_parser.parse_query("sea whale")?;
-
-    let mut top_collector = TopCollector::with_limit(10);
-
-    searcher.search(&*query, &mut top_collector)?;
-
-    let doc_addresses = top_collector.docs();
-
-    // The actual documents still need to be
-    // retrieved from Tantivy's store.
-    //
-    // Since the body field was not configured as stored,
-    // the document returned will only contain
-    // a title.
-
-    for doc_address in doc_addresses {
-        let retrieved_doc = searcher.doc(&doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
-    }
-    Ok(())
-}
-
-
-fn main() {
-    run().unwrap();
-}
--- a/build.rs
+++ b/build.rs
@@ -0,0 +1,61 @@
+#[cfg(feature = "simdcompression")]
+mod build {
+    extern crate cc;
+
+    pub fn build() {
+        let mut config = cc::Build::new();
+        config
+            .include("./cpp/simdcomp/include")
+            .file("cpp/simdcomp/src/avxbitpacking.c")
+            .file("cpp/simdcomp/src/simdintegratedbitpacking.c")
+            .file("cpp/simdcomp/src/simdbitpacking.c")
+            .file("cpp/simdcomp/src/simdpackedsearch.c")
+            .file("cpp/simdcomp/src/simdcomputil.c")
+            .file("cpp/simdcomp/src/simdpackedselect.c")
+            .file("cpp/simdcomp/src/simdfor.c")
+            .file("cpp/simdcomp_wrapper.c");
+
+        if !cfg!(debug_assertions) {
+            config.opt_level(3);
+
+            if cfg!(target_env = "msvc") {
+                config
+                    .define("NDEBUG", None)
+                    .flag("/Gm-")
+                    .flag("/GS-")
+                    .flag("/Gy")
+                    .flag("/Oi")
+                    .flag("/GL");
+            }
+        }
+
+        if !cfg!(target_env = "msvc") {
+            config
+                .include("./cpp/streamvbyte/include")
+                .file("cpp/streamvbyte/src/streamvbyte.c")
+                .file("cpp/streamvbyte/src/streamvbytedelta.c")
+                .flag("-msse4.1")
+                .flag("-march=native")
+                .flag("-std=c99");
+        }
+
+        config.compile("libsimdcomp.a");
+
+        // Workaround for linking static libraries built with /GL
+        // https://github.com/rust-lang/rust/issues/26003
+        if !cfg!(debug_assertions) && cfg!(target_env = "msvc") {
+            println!("cargo:rustc-link-lib=dylib=simdcomp");
+        }
+
+        println!("cargo:rerun-if-changed=cpp");
+    }
+}
+
+#[cfg(not(feature = "simdcompression"))]
+mod build {
+    pub fn build() {}
+}
+
+fn main() {
+    build::build();
+}
--- a/cpp/simdcomp/.gitignore
+++ b/cpp/simdcomp/.gitignore
@@ -0,0 +1,9 @@
+Makefile.in
+lib*
+unit*
+*.o
+src/*.lo
+src/*.o
+src/.deps
+src/.dirstamp
+src/.libs
--- a/cpp/simdcomp/.travis.yml
+++ b/cpp/simdcomp/.travis.yml
@@ -0,0 +1,11 @@
+language: c
+sudo: false
+compiler:
+  - gcc
+  - clang
+
+branches:
+  only:
+    - master
+
+script: make && ./unit
--- a/cpp/simdcomp/CHANGELOG
+++ b/cpp/simdcomp/CHANGELOG
@@ -0,0 +1,9 @@
+Upcoming
+  - added missing include
+  - improved portability (MSVC)
+  - implemented C89 compatibility
+Version 0.0.3 (19 May 2014)
+  - improved documentation
+Version 0.0.2 (6 February 2014)
+  - added go demo
+Version 0.0.1  (5 February 2014)
--- a/cpp/simdcomp/LICENSE
+++ b/cpp/simdcomp/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2014--, The authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+* Neither the name of the {organization} nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/cpp/simdcomp/README.md
+++ b/cpp/simdcomp/README.md
@@ -0,0 +1,137 @@
+The SIMDComp library
+====================
+[![Build Status](https://travis-ci.org/lemire/simdcomp.png)](https://travis-ci.org/lemire/simdcomp)
+
+A simple C library for compressing lists of integers using binary packing and SIMD instructions.
+The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers.
+
+This library can decode at least 4 billion compressed integers per second on most
+desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s.
+This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.
+
+On a Skylake Intel processor, it can decode integers at a rate of 0.3 cycles per integer,
+which can easily translate into more than 8 billion decoded integers per second.
+
+Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others
+
+What is it for?
+-------------
+
+This is a low-level library for fast integer compression. By design it does not define a compressed
+format. It is up to the (sophisticated) user to create a compressed format.
+
+Requirements
+-------------
+
+- Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.)
+- It is possible to build the core part of the code if your processor supports SSE2 (Pentium4 or better)
+- C99 compliant compiler (GCC is assumed)
+- A Linux-like distribution is assumed by the makefile
+
+For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker
+
+Usage
+-------
+
+Compression works over blocks of 128 integers.
+
+For a complete working example, see example.c (you can build it and
+run it with "make example; ./example").
+
+
+
+1) Lists of integers in random order.
+
+```C            
+const uint32_t b = maxbits(datain);// computes bit width
+simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*16 bytes
+simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
+```
+
+While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b.
+
+2) Sorted lists of integers.
+
+We used differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset).
+
+```C            
+uint32_t offset = 0;
+uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
+simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*16 bytes
+simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
+```
+
+General example for arrays of arbitrary length:
+```C
+int compress_decompress_demo() {
+  size_t k, N = 9999;
+  __m128i * endofbuf;
+  uint32_t * datain = malloc(N * sizeof(uint32_t));
+  uint8_t * buffer;
+  uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
+  uint32_t b;
+
+  for (k = 0; k < N; ++k){        /* start with k=0, not k=1! */
+    datain[k] = k;
+  }
+
+  b = maxbits_length(datain, N);
+  buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory
+  endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
+  /* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */
+  /* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */
+  simdunpack_length((const __m128i *)buffer, N, backbuffer, b);
+
+  for (k = 0; k < N; ++k){
+    if(datain[k] != backbuffer[k]) {
+      printf("bug\n");
+      return -1;
+    }
+  }
+  return 0;
+}
+```
+
+
+3) Frame-of-Reference 
+
+We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing
+routines, but do not use differential coding so they allow faster search in some cases, at the expense
+of compression.
+
+Setup
+---------
+
+
+make
+make test
+
+and if you are daring:
+
+make install
+
+Go
+--------
+
+If you are a go user, there is a "go" folder where you will find a simple demo.
+
+Other libraries
+----------------
+
+* Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
+* Fast integer compression in C using StreamVByte https://github.com/lemire/streamvbyte
+* FastPFOR is a C++ research library well suited to compress unsorted arrays: https://github.com/lemire/FastPFor
+* SIMDCompressionAndIntersection is a C++ research library well suited for sorted arrays (differential coding)
+and computing intersections: https://github.com/lemire/SIMDCompressionAndIntersection
+* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
+* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
+
+
+References
+------------
+
+* Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399
+* Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015.  http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
+* Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
+* Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916
+* T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5
--- a/cpp/simdcomp/benchmarks/benchmark.c
+++ b/cpp/simdcomp/benchmarks/benchmark.c
@@ -0,0 +1,235 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "simdcomp.h"
+
+#ifdef _MSC_VER
+# include <windows.h>
+
+__int64 freq;
+
+typedef __int64 time_snap_t;
+
+static time_snap_t time_snap(void)
+{
+	__int64 now;
+
+	QueryPerformanceCounter((LARGE_INTEGER *)&now);
+
+	return (__int64)((now*1000000)/freq);
+}
+# define TIME_SNAP_FMT "%I64d"
+#else
+# define time_snap clock
+# define TIME_SNAP_FMT "%lu"
+typedef clock_t time_snap_t;
+#endif
+
+
+void benchmarkSelect() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t initial = 33;
+    uint32_t b;
+    time_snap_t S1, S2, S3;
+    int i;
+    printf("benchmarking select \n");
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        uint32_t out[128];
+        /* initialize the buffer */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)(1655765 * i )) ;
+            if(b < 32) buffer[i] %= (1<<b);
+        }
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'i' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+
+        S1 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i % 128);
+            assert(valretrieved == buffer[i%128]);
+        }
+        S2 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+            assert(backbuffer[i % 128] == buffer[i % 128]);
+        }
+        S3 = time_snap();
+        printf("bit width = %d, fast select function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT "  \n", b, (S2-S1), (S3-S2));
+    }
+}
+
+int uint32_cmp(const void *a, const void *b)
+{
+    const uint32_t *ia = (const uint32_t *)a;
+    const uint32_t *ib = (const uint32_t *)b;
+    if(*ia < *ib)
+        return -1;
+    else if (*ia > *ib)
+        return 1;
+    return 0;
+}
+
+/* adapted from wikipedia */
+int binary_search(uint32_t * A, uint32_t key, int imin, int imax)
+{
+    int imid;
+    imax --;
+    while(imin + 1 < imax) {
+        imid = imin + ((imax - imin) / 2);
+
+        if (A[imid] > key) {
+            imax = imid;
+        } else if (A[imid] < key) {
+            imin = imid;
+        } else {
+            return imid;
+        }
+    }
+    return imax;
+}
+
+
+/* adapted from wikipedia */
+int lower_bound(uint32_t * A, uint32_t key, int imin, int imax)
+{
+    int imid;
+    imax --;
+    while(imin + 1 < imax) {
+        imid = imin + ((imax - imin) / 2);
+
+        if (A[imid] >= key) {
+            imax = imid;
+        } else if (A[imid] < key) {
+            imin = imid;
+        }
+    }
+    if(A[imin] >= key) return imin;
+    return imax;
+}
+
+void benchmarkSearch() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t out[128];
+    uint32_t result, initial = 0;
+    uint32_t b, i;
+    time_snap_t S1, S2, S3, S4;
+
+    printf("benchmarking search \n");
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        /* initialize the buffer */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)rand()) ;
+            if(b < 32) buffer[i] %= (1<<b);
+        }
+
+        qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
+
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'i' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+        simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+
+        for (i = 0; i < 128; i++) {
+            assert(buffer[i] == backbuffer[i]);
+         }
+        S1 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            __m128i vecinitial = _mm_set1_epi32(initial);
+            pos = simdsearchd1(&vecinitial, (__m128i *)out, b,
+                               pseudorandomkey, &result);
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug A.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug B.\n");
+            }
+        }
+        S2 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
+            pos =  lower_bound(backbuffer, pseudorandomkey, 0, 128);
+            result = backbuffer[pos];
+
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug C.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug D.\n");
+            }
+        }
+        S3 = time_snap();
+        for (i = 0; i < 128 * 10; i++) {
+
+            int pos;
+            uint32_t pseudorandomkey  =  buffer[i%128];
+            pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128,
+                               pseudorandomkey, &result);
+            if((result < pseudorandomkey) || (buffer[pos] != result)) {
+                printf("bug A.\n");
+            } else if (pos > 0) {
+                if(buffer[pos-1] >= pseudorandomkey)
+                    printf("bug B.\n");
+            }
+        }
+        S4 = time_snap();
+
+        printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT "  \n", b, (S2-S1), (S3-S2), (S4-S3) );
+    }
+}
+
+
+int main() {
+#ifdef _MSC_VER
+    QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
+#endif
+    benchmarkSearch();
+    benchmarkSelect();
+    return 0;
+}
--- a/cpp/simdcomp/benchmarks/bitpackingbenchmark.c
+++ b/cpp/simdcomp/benchmarks/bitpackingbenchmark.c
@@ -0,0 +1,205 @@
+#include <stdio.h>
+
+#include "simdcomp.h"
+
+/* RDTSC_START(cycles): executes cpuid then rdtsc and stores the 64-bit value edx:eax in "cycles"; cpuid is presumably there as a serializing fence (confirm). The clobber list names rax..rdx, so x86-64 GCC-style inline asm is assumed. */
+#define RDTSC_START(cycles)                                                   \
+    do {                                                                      \
+        register unsigned cyc_high, cyc_low;                                  \
+        __asm volatile(                                                       \
+            "cpuid\n\t"                                                       \
+            "rdtsc\n\t"                                                       \
+            "mov %%edx, %0\n\t"                                               \
+            "mov %%eax, %1\n\t"                                               \
+            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
+    } while (0)
+/* RDTSC_FINAL(cycles): executes rdtscp, copies edx:eax out, then runs cpuid, and stores the 64-bit counter in "cycles"; meant to close a region opened with RDTSC_START. Same x86-64 GCC-style inline asm assumption. */
+#define RDTSC_FINAL(cycles)                                                   \
+    do {                                                                      \
+        register unsigned cyc_high, cyc_low;                                  \
+        __asm volatile(                                                       \
+            "rdtscp\n\t"                                                      \
+            "mov %%edx, %0\n\t"                                               \
+            "mov %%eax, %1\n\t"                                               \
+            "cpuid\n\t"                                                       \
+            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
+    } while (0)
+
+
+/* Returns a malloc'ed array of "length" values, each rand() masked down to its "bit" low bits; the caller frees it. The allocation is not checked. */
+uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) {
+    const uint32_t lowbits = (uint32_t) ((UINT64_C(1) << bit) - 1);
+    uint32_t * const values = malloc(length * sizeof(uint32_t));
+    uint32_t * cursor = values;
+    uint32_t left;
+    for(left = length; left != 0; --left) {
+        *cursor++ = rand() & lowbits;
+    }
+    return values;
+}
+/* Returns a malloc'ed array of "length" values built as a running sum of rand() deltas masked to "bit" bits (sum wraps modulo 2^32); caller frees. Writes nothing when length is 0. */
+uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) {
+    uint32_t * answer = malloc(sizeof(uint32_t) * length);
+    uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
+    uint32_t i, sum = 0; /* sum is the value of the previous element (0 before the first one) */
+    for(i = 0; i < length; ++i) {
+        sum += rand() & mask; /* same rand() call order as before: one call per element */
+        answer[i] = sum;
+    }
+    return answer;
+}
+
+/* Benchmark of plain 128-integer bit packing: for each bit width 1..32, prints the best (minimum) cycles per integer of simdpackwithoutmask and of simdunpack over 500 runs. */
+void demo128() {
+    const uint32_t length = 128;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %d integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+        /* fresh random input and scratch buffers for this bit width; all three are freed at the end of the iteration */
+        uint32_t * data = get_random_array_from_bit_width(length, bit);
+        __m128i * buffer = malloc(length * sizeof(uint32_t)); /* 512 bytes = 32 vectors, enough for bit = 32 */
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        printf("%d\t",bit);
+        min_diff = (uint64_t)-1; /* largest uint64_t, so the first measurement always replaces it */
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdpackwithoutmask(data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1; /* reset before timing the unpack direction */
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdunpack(buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        /* the unpacked values in backdata are never compared with data: this only measures time */
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+/* Benchmark of differential (d1) 128-integer packing: for each bit width 1..32, prints the best cycles per integer of simdpackwithoutmaskd1 and of simdunpackd1 over 500 runs. */
+void demo128_d1() {
+    const uint32_t length = 128;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %d integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+        /* input is a running sum of random deltas that each fit in "bit" bits; buffers are freed at the end of the iteration */
+        uint32_t * data = get_random_array_from_bit_width_d1(length, bit);
+        __m128i * buffer = malloc(length * sizeof(uint32_t)); /* 512 bytes = 32 vectors, enough for bit = 32 */
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        printf("%d\t",bit);
+        min_diff = (uint64_t)-1; /* largest uint64_t, so the first measurement always replaces it */
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdpackwithoutmaskd1(0,data,buffer, bit); /* 0 is the initial value the first delta is taken against */
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1; /* reset before timing the unpack direction */
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            simdunpackd1(0,buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        /* the unpacked values in backdata are never compared with data: this only measures time */
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+/* AVX2-only benchmark of 256-integer packing: for each bit width 1..32, prints the best cycles per integer of avxpackwithoutmask and of avxunpack over 500 runs. */
+#ifdef __AVX2__
+void demo256() {
+    const uint32_t length = 256;
+    uint32_t bit;
+    printf("# --- %s\n", __func__);
+    printf("# compressing %d integers\n",length);
+    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
+    for(bit = 1; bit <= 32; ++bit) {
+        uint32_t i;
+        /* fresh random input and scratch buffers for this bit width; all three are freed at the end of the iteration */
+        uint32_t * data = get_random_array_from_bit_width(length, bit);
+        __m256i * buffer = malloc(length * sizeof(uint32_t)); /* 1024 bytes = 32 256-bit vectors; NOTE(review): malloc alignment vs. 32-byte vectors, confirm avxpack uses unaligned stores */
+        uint32_t * backdata = malloc(length * sizeof(uint32_t));
+        uint32_t repeat = 500;
+        uint64_t min_diff;
+        printf("%d\t",bit);
+        min_diff = (uint64_t)-1; /* largest uint64_t, so the first measurement always replaces it */
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            avxpackwithoutmask(data,buffer, bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        min_diff = (uint64_t)-1; /* reset before timing the unpack direction */
+        for (i = 0; i < repeat; i++) {
+            uint64_t cycles_start, cycles_final, cycles_diff;
+            __asm volatile("" ::: /* pretend to clobber */ "memory");
+            RDTSC_START(cycles_start);
+            avxunpack(buffer, backdata,bit);
+            RDTSC_FINAL(cycles_final);
+            cycles_diff = (cycles_final - cycles_start);
+            if (cycles_diff < min_diff) min_diff = cycles_diff;
+        }
+        printf("%.2f\t",min_diff*1.0/length);
+        /* the unpacked values in backdata are never compared with data: this only measures time */
+        free(data);
+        free(buffer);
+        free(backdata);
+        printf("\n");
+    }
+    printf("\n\n"); /* two blank lines are required by gnuplot */
+}
+#endif /* avx 2 */
+
+
+int main() { /* runs each benchmark in turn and always returns 0 */
+    demo128();
+    demo128_d1();
+#ifdef __AVX2__
+    demo256(); /* only compiled in when the compiler targets AVX2 */
+#endif
+    return 0;
+
+
+}
--- a/cpp/simdcomp/example.c
+++ b/cpp/simdcomp/example.c
@@ -0,0 +1,195 @@
+/* Type "make example" to build this example program. */
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+#include "simdcomp.h"
+
+/**
+We provide several different code examples.
+**/
+
+
+/* Very simple round trip: packs 0..N-1 with a single bit width, unpacks and compares. Returns 0 on success, -1 on the first mismatch; buffers are freed either way. */
+int compress_decompress_demo() {
+    size_t k, N = 9999;
+    __m128i * endofbuf;
+    int howmanybytes, status = 0;
+    float compratio;
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint8_t * buffer;
+    uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
+    uint32_t b;
+    printf("== simple test\n");
+
+    for (k = 0; k < N; ++k) {       /* start with k=0, not k=1! */
+        datain[k] = k;
+    }
+
+    b = maxbits_length(datain, N);
+    buffer = malloc(simdpack_compressedbytes(N,b));
+    endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
+    howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */
+    compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes;
+    /* endofbuf points to the end of the compressed data */
+    buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); /* optional but safe. */
+    printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio);
+    /* in actual applications b must be stored and retrieved: caller is responsible for that. */
+    simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */
+    for (k = 0; k < N; ++k) {
+        if(datain[k] != backbuffer[k]) {
+            printf("bug at %lu \n",(unsigned long)k);
+            status = -1; /* record the failure but fall through so the buffers are still released */
+            break;
+        }
+    }
+    if(status == 0) printf("Code works!\n");
+    free(datain);
+    free(buffer);
+    free(backbuffer);
+    return status;
+}
+
+
+
+/* Delta-compresses "length" integers from datain into buffer, one block of SIMDBlockSize values at a time,
+and returns the number of bytes written. A trailing partial block is reported with a message and not encoded.
+Used below in simple_demo. */
+size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+    uint8_t * const start = buffer;           /* kept to compute the byte count at the end */
+    const size_t blockcount = length / SIMDBlockSize;
+    uint32_t previous = 0;                    /* last value of the preceding block, 0 for the first */
+    size_t block;
+    if(length % SIMDBlockSize != 0) {
+        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
+    }
+    for(block = 0; block < blockcount; ++block) {
+        const uint32_t * current = datain + block * SIMDBlockSize;
+        const uint32_t width = simdmaxbitsd1(previous, current);
+        /* layout per block: one header byte with the bit width, then "width" 128-bit words */
+        *buffer++ = width;
+        simdpackwithoutmaskd1(previous, current, (__m128i *) buffer, width);
+        buffer += width * sizeof(__m128i);
+        previous = current[SIMDBlockSize - 1];
+    }
+    return buffer - start;
+}
+
+/* Another illustration: for gaps 1, 3, ..., 243 builds sorted data, compresses it with compress(), and prints the compression ratio, the block decoding speed and a memcpy baseline. */
+void simple_demo() {
+    size_t REPEAT = 10, gap;
+    size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    size_t compsize;
+    clock_t start, end;
+    uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    printf("== simple demo\n");
+    for (gap = 1; gap <= 243; gap *= 3) {
+        size_t k, repeat;
+        uint32_t offset = 0;
+        uint32_t bogus = 0;
+        double numberofseconds;
+        printf("\n");
+        printf(" gap = %lu \n", (unsigned long) gap);
+        datain[0] = 0;
+        for (k = 1; k < N; ++k)
+            datain[k] = datain[k-1] + ( rand() % (gap + 1) );
+        compsize = compress(datain,N,buffer);
+        printf("compression ratio = %f \n",  (N * sizeof(uint32_t))/ (compsize * 1.0 ));
+        start = clock();
+        for(repeat = 0; repeat < REPEAT; ++repeat) {
+            uint8_t * decbuffer = buffer;
+            offset = 0; /* each pass restarts at the first block, which compress() encoded against an initial value of 0 */
+            for (k = 0; k * SIMDBlockSize < N; ++k) {
+                uint8_t b = *decbuffer++;
+                simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
+                /* do something here with backbuffer */
+                bogus += backbuffer[3];
+                decbuffer += b * sizeof(__m128i);
+                offset = backbuffer[SIMDBlockSize - 1];
+            }
+        }
+        end = clock();
+        numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+        printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+        start = clock();
+        for(repeat = 0; repeat < REPEAT; ++repeat) {
+            uint8_t * decbuffer = buffer;
+            for (k = 0; k * SIMDBlockSize < N; ++k) {
+                memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t)); /* baseline only: copies raw compressed bytes, the content is meaningless */
+                bogus += backbuffer[3] - backbuffer[100];
+            }
+        }
+        end = clock();
+        numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+        printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+        printf("ignore me %i \n",bogus); /* printed so the compiler cannot discard the loops above */
+        printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
+    }
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+}
+
+/* Packs datain block by block (SIMDBlockSize values each), choosing the bit width of every block with maxbits,
+and returns the number of bytes written. A trailing partial block is reported and skipped. Used below in varying_bit_width_demo. */
+size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+    uint8_t * const start = buffer;
+    size_t remaining = length / SIMDBlockSize; /* number of complete blocks still to encode */
+    if(length % SIMDBlockSize != 0) {
+        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
+    }
+    while(remaining != 0) {
+        const uint32_t width = maxbits(datain);
+        *buffer++ = width; /* header byte: bit width of this block */
+        simdpackwithoutmask(datain, (__m128i *)buffer, width);
+        buffer += width * sizeof(__m128i);
+        datain += SIMDBlockSize;
+        --remaining;
+    }
+    return buffer - start;
+}
+
+/* Compresses two blocks of 128 integers, each with its own bit width, decodes them and compares. Returns 0 on success, -1 on a mismatch; buffers are freed either way. */
+int varying_bit_width_demo() {
+    size_t nn = 128 * 2;
+    uint32_t * datainn = malloc(nn * sizeof(uint32_t));
+    uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize);
+    uint8_t * initbuffern = buffern; /* buffern is advanced while decoding, so keep the original pointer for free() */
+    uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
+    size_t k, compsize;
+    int status = 0;
+    printf("== varying bit-width demo\n");
+    for(k=0; k<nn; ++k) {
+        datainn[k] = rand() % (k + 1);
+    }
+
+    compsize = varying_bit_width_compress(datainn,nn,buffern);
+    printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
+           (unsigned)(nn * sizeof(uint32_t)));
+
+    for (k = 0; k * SIMDBlockSize < nn; ++k) {
+        uint32_t b = *buffern; /* header byte written by varying_bit_width_compress */
+        buffern++;
+        simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
+        buffern += b * sizeof(__m128i);
+    }
+    for (k = 0; k < nn; ++k) {
+        if(backbuffern[k] != datainn[k]) {
+            printf("bug\n");
+            status = -1; /* record the failure but fall through so the buffers are still released */
+            break;
+        }
+    }
+    if(status == 0) printf("Code works!\n");
+    free(datainn);
+    free(initbuffern);
+    free(backbuffern);
+    return status;
+}
+
+int main() { /* runs the three demos; exits with -1 as soon as one of the checked demos fails */
+    if(compress_decompress_demo() != 0) return -1;
+    if(varying_bit_width_demo() != 0) return -1;
+    simple_demo(); /* timing demo only: returns nothing to check */
+    return 0;
+}
--- a/cpp/simdcomp/go/README.md
+++ b/cpp/simdcomp/go/README.md
@@ -0,0 +1,13 @@
+Simple Go demo
+==============
+
+Setup
+======
+
+Start by installing the simdcomp library (make && make install).
+
+Then type:
+
+go run test.go
+
+
--- a/cpp/simdcomp/go/test.go
+++ b/cpp/simdcomp/go/test.go
@@ -0,0 +1,71 @@
+/////////
+// This particular file is in the public domain.
+// Author: Daniel Lemire
+////////
+
+package main 
+
+/*
+#cgo LDFLAGS: -lsimdcomp
+#include <simdcomp.h>
+*/
+import "C"
+import "fmt"
+
+//////////
+// For this demo, we pack and unpack blocks of 128 integers
+/////////
+func main() {
+        // I am going to use C types. Alternative might be to use unsafe.Pointer calls, see http://bit.ly/1ndw3W3
+        // this is our original data
+        var data [128]C.uint32_t
+        for i := C.uint32_t(0); i < C.uint32_t(128); i++ {
+            data[i] = i
+        }
+        // data now holds 0, 1, ..., 127 (already sorted), and is reused by both
+        // round trips below; each one prints "Bug " and returns on the first mismatch.
+
+
+
+        ////////////
+        // We first pack without differential coding
+        ///////////
+        // computing how many bits per int. is needed
+        b  := C.maxbits(&data[0])
+        ratio := 32.0/float64(b)
+        fmt.Println("Bit width  ", b)
+        fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio))
+         // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
+        out := make([] C.__m128i,b)       
+        C.simdpackwithoutmask( &data[0],&out[0],b);
+        var recovereddata [128]C.uint32_t
+        C.simdunpack(&out[0],&recovereddata[0],b)
+        for i := 0; i < 128; i++ {
+            if data[i] != recovereddata[i]  {
+                  fmt.Println("Bug ")
+                  return
+            }
+        } 
+
+        ///////////
+        // Next, we use differential coding
+        //////////
+        offset := C.uint32_t(0) // if you pack data from K to K + 128, offset should be the value at K-1. When K = 0, choose a default
+        b1  := C.simdmaxbitsd1(offset,&data[0])
+        ratio1 := 32.0/float64(b1)
+        fmt.Println("Bit width  ", b1)
+        fmt.Println(fmt.Sprintf("Compression ratio %f ", ratio1))
+         // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
+        out = make([] C.__m128i,b1)       
+        C.simdpackwithoutmaskd1(offset, &data[0],&out[0],b1);
+        C.simdunpackd1(offset,&out[0],&recovereddata[0],b1)
+        for i := 0; i < 128; i++ {
+            if data[i] != recovereddata[i]  {
+                  fmt.Println("Bug ")
+                  return
+            }
+        } 
+
+        fmt.Println("test succesful.")
+      
+}
--- a/cpp/simdcomp/include/avxbitpacking.h
+++ b/cpp/simdcomp/include/avxbitpacking.h
@@ -0,0 +1,40 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef INCLUDE_AVXBITPACKING_H_
+#define INCLUDE_AVXBITPACKING_H_
+
+
+#ifdef __AVX2__
+
+#include "portability.h"
+
+
+/* AVX2 is required */
+#include <immintrin.h>
+/* for memset */
+#include <string.h>
+
+#include "simdcomputil.h"
+
+enum{ AVXBlockSize = 256};
+
+/* max integer logarithm over a range of AVXBlockSize integers (256 integers) */
+uint32_t avxmaxbits(const uint32_t * begin);
+
+/* reads 256 values from "in", writes  "bit" 256-bit vectors to "out" */
+void avxpack(const uint32_t *  in,__m256i *  out, const uint32_t bit);
+
+/* reads 256 values from "in", writes  "bit" 256-bit vectors to "out" */
+void avxpackwithoutmask(const uint32_t *  in,__m256i *  out, const uint32_t bit);
+
+/* reads  "bit" 256-bit vectors from "in", writes  256 values to "out" */
+void avxunpack(const __m256i *  in,uint32_t *  out, const uint32_t bit);
+
+
+
+
+#endif /* __AVX2__ */
+
+#endif /* INCLUDE_AVXBITPACKING_H_ */
--- a/cpp/simdcomp/include/portability.h
+++ b/cpp/simdcomp/include/portability.h
@@ -0,0 +1,81 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITCOMPAT_H_
+#define SIMDBITCOMPAT_H_
+
+#include <iso646.h> /* mostly for Microsoft compilers */
+#include <string.h>
+/* Inlining/purity attribute macros. With SIMDCOMP_DEBUG set they are plain or empty; otherwise GCC >= 3 gets the __attribute__ forms, MSVC gets __forceinline, and any other compiler is probed with __has_attribute. NOTE(review): __has_attribute is used without a defined() guard, so the last branch presumably fails to preprocess on compilers that lack it -- confirm. */
+#if SIMDCOMP_DEBUG
+# define SIMDCOMP_ALWAYS_INLINE inline
+# define SIMDCOMP_NEVER_INLINE
+# define SIMDCOMP_PURE
+#else
+# if defined(__GNUC__)
+#  if __GNUC__ >= 3
+#   define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
+#   define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
+#   define SIMDCOMP_PURE __attribute__((pure))
+#  else
+#   define SIMDCOMP_ALWAYS_INLINE inline
+#   define SIMDCOMP_NEVER_INLINE
+#   define SIMDCOMP_PURE
+#  endif
+# elif defined(_MSC_VER)
+#  define SIMDCOMP_ALWAYS_INLINE __forceinline
+#  define SIMDCOMP_NEVER_INLINE
+#  define SIMDCOMP_PURE
+# else
+#  if __has_attribute(always_inline)
+#   define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
+#  else
+#   define SIMDCOMP_ALWAYS_INLINE inline
+#  endif
+#  if __has_attribute(noinline)
+#   define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
+#  else
+#   define SIMDCOMP_NEVER_INLINE
+#  endif
+#  if __has_attribute(pure)
+#   define SIMDCOMP_PURE __attribute__((pure))
+#  else
+#   define SIMDCOMP_PURE
+#  endif
+# endif
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER < 1600
+typedef unsigned int uint32_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+#else
+#include <stdint.h> /* part of Visual Studio 2010 and better, others likely anyway */
+#endif
+
+#if defined(_MSC_VER)
+#define SIMDCOMP_ALIGNED(x) __declspec(align(x))
+#else
+#if defined(__GNUC__)
+#define SIMDCOMP_ALIGNED(x) __attribute__ ((aligned(x)))
+#endif
+#endif
+
+#if defined(_MSC_VER)
+# include <intrin.h>
+/* SIMDCOMP_CTZ(result, mask): stores in result the index of the lowest set bit of a 32-bit mask, or 32 when mask is 0, on every compiler (__builtin_ctz(0) alone is undefined). 64-bit needs extending. */
+# define SIMDCOMP_CTZ(result, mask) do { \
+		unsigned long index; \
+		if (!_BitScanForward(&(index), (mask))) { \
+			(result) = 32U; \
+		} else { \
+			(result) = (uint32_t)(index); \
+		} \
+	} while (0)
+#else
+# define SIMDCOMP_CTZ(result, mask) do { const uint32_t simdcomp_ctz_mask_ = (uint32_t)(mask); \
+	(result) = simdcomp_ctz_mask_ ? (uint32_t)__builtin_ctz(simdcomp_ctz_mask_) : 32U; } while (0)
+#endif
+
+#endif /* SIMDBITCOMPAT_H_ */
+
--- a/cpp/simdcomp/include/simdbitpacking.h
+++ b/cpp/simdcomp/include/simdbitpacking.h
@@ -0,0 +1,72 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITPACKING_H_
+#define SIMDBITPACKING_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+/* for memset */
+#include <string.h>
+
+#include "simdcomputil.h"
+
+/***
+* Please see example.c for various examples on how to make good use
+* of these functions.
+*/
+
+
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
+ * The input values are masked so that only the least significant "bit" bits are used. */
+void simdpack(const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
+ * The input values are assumed to be less than 1<<bit. */
+void simdpackwithoutmask(const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+/* reads  "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpack(const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+
+/* how many compressed bytes are needed to compress "length" integers using a bit width of "bit" with
+the simdpack_length function. */
+int simdpack_compressedbytes(int length, const uint32_t bit);
+
+/* like simdpack, but supports an undetermined number of inputs.
+ * This is useful if you need to pack an array of integers whose length is not a multiple of 128.
+ * Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location between
+ the provided (out) pointer and the returned pointer. */
+__m128i * simdpack_length(const uint32_t *   in, size_t length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpack, but supports an undetermined number of inputs.
+ * This is useful if you need to unpack an array of integers that is not divisible by 128 integers.
+ * Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided 
+ (in) pointer and the returned pointer. */
+const __m128i * simdunpack_length(const __m128i *   in, size_t length, uint32_t * out, const uint32_t bit);
+
+
+
+
+/* like simdpack, but supports an undetermined small number of inputs. This is useful if you need to pack less 
+than 128 integers.
+ * Note that this function is much slower.
+ * Returns a pointer to the (advanced) compressed array. Compressed data is stored in the memory location 
+ between the provided (out) pointer and the returned pointer. */
+__m128i * simdpack_shortlength(const uint32_t *   in, int length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpack, but supports an undetermined small number of inputs. This is useful if you need to unpack less
+ than 128 integers.
+ * Note that this function is much slower.
+ * Returns a pointer to the (advanced) compressed array. The read compressed data is between the provided (in) 
+ pointer and the returned pointer. */
+const __m128i * simdunpack_shortlength(const __m128i *   in, int length, uint32_t * out, const uint32_t bit);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
+void simdfastset(__m128i * in128, uint32_t b, uint32_t value, size_t index);
+
+#endif /* SIMDBITPACKING_H_ */
--- a/cpp/simdcomp/include/simdcomp.h
+++ b/cpp/simdcomp/include/simdcomp.h
@@ -0,0 +1,22 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMP_H_
+#define SIMDCOMP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "simdbitpacking.h"
+#include "simdcomputil.h"
+#include "simdfor.h"
+#include "simdintegratedbitpacking.h"
+#include "avxbitpacking.h"
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif 
--- a/cpp/simdcomp/include/simdcomputil.h
+++ b/cpp/simdcomp/include/simdcomputil.h
@@ -0,0 +1,54 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMPUTIL_H_
+#define SIMDCOMPUTIL_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+
+
+
+/* returns the integer logarithm of v (bit width) */
+uint32_t bits(const uint32_t v);
+
+/* max integer logarithm over a range of SIMDBlockSize integers (128 integers) */
+uint32_t maxbits(const uint32_t * begin);
+
+/* same as maxbits, but we specify the number of integers */
+uint32_t maxbits_length(const uint32_t * in,uint32_t length);
+
+enum{ SIMDBlockSize = 128};
+
+
+/* computes (quickly) the minimal value of 128 values */
+uint32_t simdmin(const uint32_t * in);
+
+/* computes (quickly) the minimal value of the specified number of values */
+uint32_t simdmin_length(const uint32_t * in, uint32_t length);
+
+#ifdef __SSE4_1__
+/* computes (quickly) the minimal and maximal value of the specified number of values */
+void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax);
+
+/* computes (quickly) the minimal and maximal value of the 128 values */
+void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax);
+
+#endif
+
+/* like maxbits over 128 integers (SIMDBlockSize) with provided initial value
+   and using differential coding */
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in);
+
+/* like simdmaxbitsd1, but calculates maxbits over |length| integers 
+   with provided initial value. |length| can be any arbitrary value. */
+uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
+                uint32_t length);
+
+
+
+#endif /* SIMDCOMPUTIL_H_ */
--- a/cpp/simdcomp/include/simdfor.h
+++ b/cpp/simdcomp/include/simdfor.h
@@ -0,0 +1,72 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef INCLUDE_SIMDFOR_H_
+#define INCLUDE_SIMDFOR_H_
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+#include "simdcomputil.h"
+#include "simdbitpacking.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out" */
+void simdpackFOR(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpackFOR(uint32_t initvalue, const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+/* how many compressed bytes are needed to compress "length" integers using a bit width of "bit" with
+the simdpackFOR_length function. */
+int simdpackFOR_compressedbytes(int length, const uint32_t bit);
+
+/* like simdpackFOR, but supports an undetermined number of inputs. 
+This is useful if you need to pack less than 128 integers. Note that this function is much slower. 
+ Compressed data is stored in the memory location between 
+ the provided (out) pointer and the returned pointer. */
+__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t *   in, int length, __m128i *    out, const uint32_t bit);
+
+/* like simdunpackFOR, but supports an undetermined number of inputs. 
+This is useful if you need to unpack less than 128 integers. Note that this function is much slower. 
+ The read compressed data is between the provided 
+ (in) pointer and the returned pointer.  */
+const __m128i * simdunpackFOR_length(uint32_t initvalue, const __m128i *   in, int length, uint32_t * out, const uint32_t bit);
+
+
+/* returns the value stored at the specified "slot".
+* */
+uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int slot);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value" */
+void simdfastsetFOR(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
+
+
+/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The value found is stored in "*presult".
+ * Only the first "length" decoded integers are considered; the others are ignored.
+ * Length should be no larger than 128.
+ *
+ * If no value is larger than or equal to the key,
+ * length is returned. */
+int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int length, uint32_t key, uint32_t *presult);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+
+
+
+#endif /* INCLUDE_SIMDFOR_H_ */
--- a/cpp/simdcomp/include/simdintegratedbitpacking.h
+++ b/cpp/simdcomp/include/simdintegratedbitpacking.h
@@ -0,0 +1,98 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMD_INTEGRATED_BITPACKING_H
+#define SIMD_INTEGRATED_BITPACKING_H
+
+#include "portability.h"
+
+/* SSE2 is required */
+#include <emmintrin.h>
+
+#include "simdcomputil.h"
+#include "simdbitpacking.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
+   integer values should be in sorted order (for best results).
+   The differences are masked so that only the least significant "bit" bits are used. */
+void simdpackd1(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
+   integer values should be in sorted order (for best results).
+   The difference values are assumed to be less than 1<<bit. */
+void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t *  in,__m128i *  out, const uint32_t bit);
+
+
+/* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
+void simdunpackd1(uint32_t initvalue, const __m128i *  in,uint32_t *  out, const uint32_t bit);
+
+
+/* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The value found is stored in "*presult". If no value is larger than or equal to the key,
+ * 128 is returned. The pointer initOffset is a pointer to the last four values decoded
+ * (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init)),
+ * and the vector gets updated.
+ **/
+int
+simdsearchd1(__m128i * initOffset, const __m128i *in, uint32_t bit,
+                uint32_t key, uint32_t *presult);
+
+
+/* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for the first encoded uint32 value
+ * which is >= |key|, and returns its position. It is assumed that the values
+ * stored are in sorted order.
+ * The value found is stored in "*presult".
+ * Only the first "length" decoded integers are considered; the others are ignored.
+ * Length should be no larger than 128.
+ *
+ * If no value is larger than or equal to the key,
+ * length is returned. */
+int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int length, uint32_t key, uint32_t *presult);
+
+
+
+/* returns the value stored at the specified "slot".
+* */
+uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
+                int slot);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value",
+ * you must somehow know the previous value.
+ * Because of differential coding, all following values are incremented by the offset between this new
+ * value and the old value...
+ * This function is useful if you want to modify the last value.
+ */
+void simdfastsetd1fromprevious( __m128i * in, uint32_t bit, uint32_t previousvalue, uint32_t value, size_t index);
+
+/* given a block of 128 packed values, this function sets the value at index "index" to "value",
+ * This function computes the previous value if needed.
+ * Because of differential coding, all following values are incremented by the offset between this new
+ * value and the old value...
+ * This function is useful if you want to modify the last value.
+ */
+void simdfastsetd1(uint32_t initvalue, __m128i * in, uint32_t bit, uint32_t value, size_t index);
+
+
+/* Simply scan the data.
+* The pointer initOffset is a pointer to the last four values decoded
+* (when starting out, this can be a zero vector or initialized with _mm_set1_epi32(init);),
+* and the vector gets updated.
+* */
+
+void
+simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
--- a/cpp/simdcomp/makefile
+++ b/cpp/simdcomp/makefile
@@ -0,0 +1,79 @@
+# minimalist makefile
+.SUFFIXES:
+#
+.SUFFIXES: .cpp .o .c .h
+ifeq ($(DEBUG),1)
+CFLAGS = -fPIC  -std=c89 -ggdb -msse4.1 -march=native -Wall -Wextra -Wshadow -fsanitize=undefined  -fno-omit-frame-pointer -fsanitize=address
+else
+CFLAGS = -fPIC -std=c89 -O3 -msse4.1  -march=native -Wall -Wextra -Wshadow
+endif # debug
+LDFLAGS = -shared
+LIBNAME=libsimdcomp.so.0.0.3
+all:  unit unit_chars bitpackingbenchmark $(LIBNAME)
+test:
+	./unit
+	./unit_chars
+# Installs the shared library and the public headers under /usr/local.
+# The prerequisite is $(LIBNAME) (defined above): prerequisites are expanded
+# when the rule is read, and OBJECTS is only defined further down, so
+# depending on $(OBJECTS) here would expand to nothing.
+install: $(LIBNAME)
+	cp $(LIBNAME) /usr/local/lib
+	# -f lets a re-install replace an existing symlink instead of failing
+	ln -sf /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
+	ldconfig
+	cp $(HEADERS) /usr/local/include
+
+
+
+HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h
+
+uninstall:
+	for h in $(HEADERS) ; do rm  /usr/local/$$h; done
+	rm  /usr/local/lib/$(LIBNAME)
+	rm /usr/local/lib/libsimdcomp.so
+	ldconfig
+
+
+OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \
+		 simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o
+
+$(LIBNAME): $(OBJECTS)
+	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS)  $(LDFLAGS)
+
+
+avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude
+
+
+simdfor.o: ./src/simdfor.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude
+
+
+simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
+
+simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
+
+simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c  $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
+
+simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude
+
+simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude
+
+example: ./example.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o example ./example.c -Iinclude  $(OBJECTS)
+
+unit: ./tests/unit.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude  $(OBJECTS)
+
+bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude  $(OBJECTS)
+benchmark: ./benchmarks/benchmark.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude  $(OBJECTS)
+dynunit: ./tests/unit.c    $(HEADERS) $(LIBNAME)
+	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude  -lsimdcomp
+
+unit_chars: ./tests/unit_chars.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude  $(OBJECTS)
+clean:
+	rm -f unit *.o $(LIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars
--- a/cpp/simdcomp/makefile.vc
+++ b/cpp/simdcomp/makefile.vc
@@ -0,0 +1,104 @@
+
+!IFNDEF MACHINE
+!IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64"
+MACHINE=x64
+!ELSE
+MACHINE=x86
+!ENDIF
+!ENDIF
+
+!IFNDEF DEBUG
+DEBUG=no
+!ENDIF
+
+!IFNDEF CC
+CC=cl.exe
+!ENDIF
+
+!IFNDEF AR
+AR=lib.exe
+!ENDIF
+
+!IFNDEF LINK
+LINK=link.exe
+!ENDIF
+
+!IFNDEF PGO
+PGO=no
+!ENDIF
+
+!IFNDEF PGI
+PGI=no
+!ENDIF
+
+INC = /Iinclude
+
+!IF "$(DEBUG)"=="yes"
+CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm
+ARFLAGS = /nologo
+LDFLAGS = /nologo /debug /nodefaultlib:msvcrt
+!ELSE
+CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP
+ARFLAGS = /nologo /LTCG
+LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf
+!ENDIF
+
+!IF "$(PGI)"=="yes"
+LDFLAGS = $(LDFLAGS) /ltcg:pgi
+!ENDIF
+
+!IF "$(PGO)"=="yes"
+LDFLAGS = $(LDFLAGS) /ltcg:pgo
+!ENDIF
+
+LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \
+	simdpackedsearch.obj simdpackedselect.obj simdfor.obj
+
+
+all: lib dll dynunit unit_chars example benchmark
+# need some good use case scenario to train the instrumented build
+	@if "$(PGI)"=="yes" echo Running PGO training
+	@if "$(PGI)"=="yes" benchmark.exe >nul 2>&1
+	@if "$(PGI)"=="yes" example.exe >nul 2>&1
+
+
+$(LIB_OBJS):
+	$(CC) $(INC) $(CFLAGS) /c src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \
+		src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c
+
+lib: $(LIB_OBJS)
+	$(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS)
+
+dll: $(LIB_OBJS)
+	$(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS)
+
+# Test, example and benchmark programs.
+# Sources live in tests/ and benchmarks/ (see the GNU makefile); targets that
+# depend on "lib" link the static library simdcomp_a.lib, and only "dynunit"
+# links the DLL import library simdcomp.lib.
+unit: lib
+	$(CC) $(INC) $(CFLAGS) /c tests/unit.c
+	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib
+
+dynunit: dll
+	$(CC) $(INC) $(CFLAGS) /c tests/unit.c
+	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib
+
+unit_chars: lib
+	$(CC) $(INC) $(CFLAGS) /c tests/unit_chars.c
+	$(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp_a.lib
+
+
+example: lib
+	$(CC) $(INC) $(CFLAGS) /c example.c
+	$(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp_a.lib
+
+benchmark: lib
+	$(CC) $(INC) $(CFLAGS) /c benchmarks/benchmark.c
+	$(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp_a.lib
+
+clean:
+	del /Q *.obj
+	del /Q *.lib
+	del /Q *.exe
+	del /Q *.dll
+	del /Q *.pgc
+	del /Q *.pgd
+	del /Q *.pdb
+
--- a/cpp/simdcomp/package.json
+++ b/cpp/simdcomp/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "simdcomp",
+  "version": "0.0.3",
+  "repo": "lemire/simdcomp",
+  "description": "A simple C library for compressing lists of integers",
+  "license": "BSD-3-Clause",
+  "src": [
+    "src/simdbitpacking.c",
+    "src/simdcomputil.c",
+    "src/simdintegratedbitpacking.c",
+    "include/simdbitpacking.h",
+    "include/simdcomp.h",
+    "include/simdcomputil.h",
+    "include/simdintegratedbitpacking.h"
+  ]
+}
--- a/cpp/simdcomp/scripts/avxpacking.py
+++ b/cpp/simdcomp/scripts/avxpacking.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+import sys
+def howmany(bit):
+    """ how many values are we going to pack? """
+    # the block size is fixed: 256 integers whatever the bit width, so `bit` is ignored
+    return 256
+
+def howmanywords(bit):
+    """ number of 256-bit words needed to hold howmany(bit) values of `bit` bits each """
+    # floor division: the result must stay an integer under Python 2 and Python 3 alike
+    return (howmany(bit) * bit + 255) // 256
+
+def howmanybytes(bit):
+    """ number of bytes taken by the packed block (only used in generated comments) """
+    # a 256-bit word is 32 bytes wide
+    return howmanywords(bit) * 32
+
+print("""
+/** code generated by avxpacking.py starts here **/
+""")
+
+print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""")
+print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""")
+
+
+
+
+
+
+def plurial(number):
+    """ returns the plural suffix: "s" unless number is exactly 1 """
+    # "!=" instead of the Python-2-only "<>" operator
+    if(number != 1):
+        return "s"
+    else :
+        return ""
+
+print("")
+print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {");
+print("  (void)compressed;");
+print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
+print("}");
+print("")
+
+# Emits avxpackblock1 .. avxpackblock32: each packs 256 integers (32 vectors of
+# 8 lanes) without masking the inputs. Two registers, w0 and w1, alternate as
+# the output word being filled (word index modulo 2).
+# Integer arithmetic uses "//" and "!=" so the script runs under Python 2 and 3.
+for bit in range(1,33):
+    print("")
+    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
+    print("  const __m256i * in = (const __m256i *)  pin;");
+    print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    # tmp is declared whenever bit is not a power of two
+    if( (bit & (bit-1)) != 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
+    oldword = 0
+    for j in range(howmany(bit)//8):
+      # firstword/secondword: output words holding the first and last bit of input j
+      firstword = j * bit // 32
+      if(firstword > oldword):
+        # the previous output word is complete: flush it
+        print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
+        oldword = firstword
+      secondword = (j * bit + bit - 1)//32
+      firstshift = (j*bit) % 32
+      if( firstword == secondword):
+          if(firstshift == 0):
+            print("  w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j))
+          else:
+            print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
+      else:
+          # the value spans two words: low bits go to the current word, the rest starts the next one
+          print("  tmp = _mm256_lddqu_si256 (in + {0});".format(j))
+          print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
+          secondshift = 32-firstshift
+          print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
+    # flush the last output word
+    print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
+    print("}");
+    print("")
+
+
+print("")
+print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {");
+print("  (void)compressed;");
+print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
+print("}");
+print("")
+
+# Emits avxpackblockmask1 .. avxpackblockmask32: same layout as the unmasked
+# packers, but every input vector is first ANDed with (1<<bit)-1 (no mask is
+# needed for bit == 32).
+# Integer arithmetic uses "//" and "!=" so the script runs under Python 2 and 3.
+for bit in range(1,33):
+    print("")
+    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
+    print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    print("  const __m256i * in = (const __m256i *) pin;");
+    if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
+    def maskfnc(x):
+        # wraps expression x in the AND with the mask, except for full 32-bit values
+        if(bit == 32): return x
+        return " _mm256_and_si256 ( mask, {0}) ".format(x)
+    # tmp is declared whenever bit is not a power of two
+    if( (bit & (bit-1)) != 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
+    oldword = 0
+    for j in range(howmany(bit)//8):
+      # firstword/secondword: output words holding the first and last bit of input j
+      firstword = j * bit // 32
+      if(firstword > oldword):
+        # the previous output word is complete: flush it
+        print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
+        oldword = firstword
+      secondword = (j * bit + bit - 1)//32
+      firstshift = (j*bit) % 32
+      loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
+      if( firstword == secondword):
+          if(firstshift == 0):
+            print("  w{0} = {1};".format(firstword%2,loadstr))
+          else:
+            print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
+      else:
+          # the value spans two words: low bits go to the current word, the rest starts the next one
+          print("  tmp = {0};".format(loadstr))
+          print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
+          secondshift = 32-firstshift
+          print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
+    # flush the last output word
+    print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
+    print("}");
+    print("")
+
+
+# Emits avxunpackblock0: a zero-bit block decodes to howmany(0) zeros.
+# memset takes a byte count, so the number of integers is scaled by
+# sizeof(uint32_t); otherwise only the first quarter of the output is cleared.
+print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {");
+print("  (void) compressed;");
+print("  memset(pout,0,{0} * sizeof(uint32_t));".format(howmany(0)));
+print("}");
+print("")
+
+# Emits avxunpackblock1 .. avxunpackblock32: each decodes 256 integers
+# (32 output vectors of 8 lanes). Two registers, w0 and w1, alternate as the
+# input word being read (word index modulo 2).
+# Integer arithmetic uses "//" and "!=" so the script runs under Python 2 and 3.
+for bit in range(1,33):
+    print("")
+    print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
+    print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit));
+    print("  /* we are going to access  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
+    if(howmanywords(bit) == 1):
+      print("  __m256i w0;")
+    else:
+      print("  __m256i w0, w1;")
+    print("  __m256i * out = (__m256i *) pout;");
+    if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
+    maskstr = " _mm256_and_si256 ( mask, {0}) "
+    if (bit == 32) : maskstr = " {0} " # no need
+    oldword = 0
+    print("  w0 = _mm256_lddqu_si256 (compressed);")
+    for j in range(howmany(bit)//8):
+      # firstword/secondword: input words holding the first and last bit of output j
+      firstword = j * bit // 32
+      secondword = (j * bit + bit - 1)//32
+      if(secondword > oldword):
+        # a new input word is needed: load it into the register it alternates to
+        print("  w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
+        oldword = secondword
+      firstshift = (j*bit) % 32
+      firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
+      if(firstshift == 0):
+          firstshiftstr =" w{0} " # no need
+      wfirst = firstshiftstr.format(firstword%2)
+      if( firstword == secondword):
+          # no mask needed when the value occupies the topmost bits of the word
+          if(firstshift + bit != 32):
+            wfirst  = maskstr.format(wfirst)
+          print("  _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
+      else:
+          # the value spans two words: combine the high bits of the first with the low bits of the second
+          secondshift = (32-firstshift)
+          wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
+          wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
+          wfirstorsecond = maskstr.format(wfirstorsecond)
+          print("  _mm256_storeu_si256(out + {0},\n    {1});".format(j,wfirstorsecond))
+    print("}");
+    print("")
+
+
+# Emits the three dispatch tables, each indexed by bit width 0..32:
+# (function pointer type, array name, function name prefix)
+dispatchtables = [
+  ("avxpackblockfnc", "avxfuncPackArr", "avxpackblock"),
+  ("avxpackblockfnc", "avxfuncPackMaskArr", "avxpackblockmask"),
+  ("avxunpackblockfnc", "avxfuncUnpackArr", "avxunpackblock"),
+]
+for fnctype, arrayname, prefix in dispatchtables:
+  print("static {0} {1}[] = {{".format(fnctype, arrayname))
+  for bit in range(0,32):
+    print("&{0}{1},".format(prefix, bit))
+  # the last entry carries no trailing comma
+  print("&{0}32".format(prefix))
+  print("};")
+print("/** code generated by avxpacking.py ends here **/")
--- a/cpp/simdcomp/scripts/simdfor.py
+++ b/cpp/simdcomp/scripts/simdfor.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+
+from math import ceil
+
+print("""
+/**
+* Blablabla
+*
+*/
+
+""");
+
+def mask(bit):
+  """Decimal string of the integer whose `bit` low-order bits are all set."""
+  lowbits = (1 << bit) - 1
+  return str(lowbits)
+
+for length in [32]:
+  print("""
+static __m128i  iunpackFOR0(__m128i initOffset, const __m128i *   _in , uint32_t *    _out) {
+    __m128i       *out = (__m128i*)(_out);
+    int i;
+    (void) _in;
+    for (i = 0; i < 8; ++i) {
+        _mm_store_si128(out++, initOffset);
+    	_mm_store_si128(out++, initOffset);
+        _mm_store_si128(out++, initOffset);
+        _mm_store_si128(out++, initOffset);
+    }
+
+    return initOffset;
+}
+
+  """)
+  print("""
+
+static void ipackFOR0(__m128i initOffset , const uint32_t *   _in , __m128i *  out  ) {
+    (void) initOffset;
+    (void) _in;
+    (void) out;
+}
+""") 
+  for bit in range(1,33):
+    offsetVar = " initOffset";
+    print("""  
+static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t *   _in, __m128i *   out) {
+    const __m128i       *in = (const __m128i*)(_in);
+    __m128i    OutReg;
+
+      """);
+    
+    if (bit != 32):
+      print("    __m128i CurrIn = _mm_load_si128(in);");
+      print("    __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);");
+    else:
+      print("    __m128i InReg = _mm_load_si128(in);");
+      print("    (void) initOffset;");
+
+
+    inwordpointer = 0
+    valuecounter = 0
+    for k in range(ceil((length * bit) / 32)):
+      if(valuecounter == length): break
+      for x in range(inwordpointer,32,bit):
+        if(x!=0) :
+          print("    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));");
+        else:
+          print("    OutReg = InReg; ");
+        if((x+bit>=32) ):
+          while(inwordpointer<32):
+            inwordpointer += bit
+          print("    _mm_store_si128(out, OutReg);");
+          print("");
+
+          if(valuecounter + 1 < length):
+            print("    ++out;")
+          inwordpointer -= 32;
+          if(inwordpointer>0):
+            print("    OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");");
+        if(valuecounter + 1 < length):
+          print("    ++in;") 
+
+          if (bit != 32):
+            print("    CurrIn = _mm_load_si128(in);");
+            print("    InReg = _mm_sub_epi32(CurrIn, initOffset);");
+          else:
+            print("    InReg = _mm_load_si128(in);");
+          print("");
+        valuecounter = valuecounter + 1
+        if(valuecounter == length): break
+    assert(valuecounter == length)
+    print("\n}\n\n""")
+
+  for bit in range(1,32):
+    offsetVar = " initOffset";
+    print("""\n
+static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const  __m128i*   in, uint32_t *   _out) {
+      """);
+    print("""    __m128i*   out = (__m128i*)(_out);
+    __m128i    InReg = _mm_load_si128(in);
+    __m128i    OutReg;    
+    __m128i     tmp;
+    const __m128i mask =  _mm_set1_epi32((1U<<"""+str(bit)+""")-1);
+
+    """);
+
+    MainText = "";
+
+    MainText += "\n";
+    inwordpointer = 0
+    valuecounter = 0
+    for k in range(ceil((length * bit) / 32)):
+      for x in range(inwordpointer,32,bit):
+        if(valuecounter == length): break
+        if (x > 0):
+          MainText += "    tmp = _mm_srli_epi32(InReg," + str(x) +");\n"; 
+        else:
+          MainText += "    tmp = InReg;\n"; 
+        if(x+bit<32):
+          MainText += "    OutReg = _mm_and_si128(tmp, mask);\n";
+        else:
+          MainText += "    OutReg = tmp;\n";        
+        if((x+bit>=32) ):      
+          while(inwordpointer<32):
+            inwordpointer += bit
+          if(valuecounter + 1 < length):
+             MainText += "    ++in;"
+             MainText += "    InReg = _mm_load_si128(in);\n";
+          inwordpointer -= 32;
+          if(inwordpointer>0):
+            MainText += "    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n";
+        if (bit != 32):
+          MainText += "    OutReg = _mm_add_epi32(OutReg, initOffset);\n"; 
+        MainText += "    _mm_store_si128(out++, OutReg);\n\n"; 
+        MainText += "";
+        valuecounter = valuecounter + 1
+        if(valuecounter == length): break
+    assert(valuecounter == length)
+    print(MainText)
+    print("    return initOffset;");
+    print("\n}\n\n")
+  print("""
+static __m128i iunpackFOR32(__m128i initvalue , const  __m128i*   in, uint32_t *    _out) {
+	__m128i * mout = (__m128i *)_out;
+	__m128i invec;
+	size_t k;
+	for(k = 0; k < 128/4; ++k) {
+		invec =  _mm_load_si128(in++);
+	    _mm_store_si128(mout++, invec);
+	}
+	return invec;
+}
+  """)
--- a/cpp/simdcomp/simdcomp.def
+++ b/cpp/simdcomp/simdcomp.def
@@ -0,0 +1,40 @@
+; Symbols exported by simdcomp.dll; each symbol is listed exactly once.
+EXPORTS
+	simdpack
+	simdpackwithoutmask
+	simdunpack
+	bits
+	maxbits
+	maxbits_length
+	simdmin
+	simdmin_length
+	simdmaxmin
+	simdmaxmin_length
+	simdmaxbitsd1
+	simdmaxbitsd1_length
+	simdpackd1
+	simdpackwithoutmaskd1
+	simdunpackd1
+	simdsearchd1
+	simdsearchwithlengthd1
+	simdselectd1
+	simdpackFOR
+	simdselectFOR
+	simdsearchwithlengthFOR
+	simdunpackFOR
+	simdpack_length
+	simdpackFOR_length
+	simdunpackFOR_length
+	simdpack_shortlength
+	simdfastsetFOR
+	simdfastset
+	simdfastsetd1
+	simdunpack_length
+	simdunpack_shortlength
+	simdscand1
+	simdfastsetd1fromprevious
+
--- a/cpp/simdcomp/src/avxbitpacking.c
+++ b/cpp/simdcomp/src/avxbitpacking.c
--- a/cpp/simdcomp/src/simdbitpacking.c
+++ b/cpp/simdcomp/src/simdbitpacking.c
--- a/cpp/simdcomp/src/simdcomputil.c
+++ b/cpp/simdcomp/src/simdcomputil.c
@@ -0,0 +1,234 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#include "simdcomputil.h"
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+#include <assert.h>
+
+/*
+ * Delta(curr, prev): lane-wise differences between consecutive 32-bit values.
+ * curr is shifted up by one 32-bit lane (4 bytes) and the highest lane of prev
+ * is moved into the vacated lowest lane (12-byte right shift); subtracting that
+ * from curr leaves, in each lane, the value minus its predecessor, where the
+ * predecessor of lane 0 is the last lane of prev.
+ * Note that curr is evaluated twice.
+ */
+#define Delta(curr, prev) \
+    _mm_sub_epi32(curr, \
+            _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12)))
+
+/* Bit width of v: position of its highest set bit plus one, or 0 when v is 0. */
+uint32_t bits(const uint32_t v) {
+#ifdef _MSC_VER
+    unsigned long highest;
+#endif
+    /* neither _BitScanReverse nor __builtin_clz is meaningful for a zero input */
+    if (v == 0) {
+        return 0;
+    }
+#ifdef _MSC_VER
+    _BitScanReverse(&highest, v);
+    return highest + 1;
+#else
+    /* assume GCC-like compiler if not microsoft */
+    return 32 - __builtin_clz(v);
+#endif
+}
+
+
+
+/* Bit width of the OR of the four 32-bit lanes of accumulator. */
+static uint32_t maxbitas32int(const __m128i accumulator) {
+	/* fold the two upper lanes onto the two lower ones */
+	const __m128i halves = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator);
+	/* fold lane 1 onto lane 0, which now holds the OR of all four lanes */
+	const __m128i folded = _mm_or_si128(_mm_srli_si128(halves, 4), halves);
+	const uint32_t allbits = _mm_cvtsi128_si32(folded);
+	return bits(allbits);
+}
+
+/* Bit width needed by the largest of the SIMDBlockSize integers starting at begin. */
+SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) {
+	const __m128i* vectors = (const __m128i*)(begin);
+	__m128i ored = _mm_loadu_si128(vectors);
+	uint32_t idx;
+	/* OR every remaining vector of four integers into the running result */
+	for (idx = 1; 4*idx < SIMDBlockSize; ++idx) {
+		ored = _mm_or_si128(ored, _mm_loadu_si128(vectors + idx));
+	}
+	return maxbitas32int(ored);
+}
+/* OR of the four 32-bit lanes of accumulator. */
+static uint32_t orasint(const __m128i accumulator) {
+	/* fold the two upper lanes onto the two lower ones */
+	const __m128i halves = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator);
+	/* fold lane 1 onto lane 0, which now holds the OR of all four lanes */
+	const __m128i folded = _mm_or_si128(_mm_srli_si128(halves, 4), halves);
+	return _mm_cvtsi128_si32(folded);
+}
+
+#ifdef __SSE4_1__
+
+/* Unsigned minimum of the four 32-bit lanes of accumulator. */
+static uint32_t minasint(const __m128i accumulator) {
+	/* lanes 0 and 1 become min(lane0, lane2) and min(lane1, lane3) */
+	const __m128i halves = _mm_min_epu32(_mm_srli_si128(accumulator, 8), accumulator);
+	/* lane 0 becomes the minimum of all four lanes */
+	const __m128i folded = _mm_min_epu32(_mm_srli_si128(halves, 4), halves);
+	return _mm_cvtsi128_si32(folded);
+}
+
+/* Unsigned maximum of the four 32-bit lanes of accumulator. */
+static uint32_t maxasint(const __m128i accumulator) {
+	/* lanes 0 and 1 become max(lane0, lane2) and max(lane1, lane3) */
+	const __m128i halves = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator);
+	/* lane 0 becomes the maximum of all four lanes */
+	const __m128i folded = _mm_max_epu32(_mm_srli_si128(halves, 4), halves);
+	return _mm_cvtsi128_si32(folded);
+}
+
+/* Unsigned minimum of the SIMDBlockSize integers starting at in. */
+uint32_t simdmin(const uint32_t * in) {
+    const __m128i* vectors = (const __m128i*)(in);
+    __m128i lanemin = _mm_loadu_si128(vectors);
+    uint32_t idx;
+    /* keep a per-lane minimum over every vector of four integers */
+    for (idx = 1; 4*idx < SIMDBlockSize; ++idx) {
+        lanemin = _mm_min_epu32(lanemin, _mm_loadu_si128(vectors + idx));
+    }
+    return minasint(lanemin);
+}
+
+/* Stores the unsigned minimum and maximum of the SIMDBlockSize integers
+ * starting at in into *getmin and *getmax. */
+void simdmaxmin(const uint32_t * in, uint32_t * getmin, uint32_t * getmax) {
+    const __m128i* vectors = (const __m128i*)(in);
+    __m128i lanemin = _mm_loadu_si128(vectors);
+    __m128i lanemax = lanemin;
+    uint32_t idx;
+    /* keep per-lane extrema over every vector of four integers */
+    for (idx = 1; 4*idx < SIMDBlockSize; ++idx) {
+        const __m128i current = _mm_loadu_si128(vectors + idx);
+        lanemin = _mm_min_epu32(lanemin, current);
+        lanemax = _mm_max_epu32(lanemax, current);
+    }
+    *getmin = minasint(lanemin);
+    *getmax = maxasint(lanemax);
+}
+
+
+/* Unsigned minimum of the first length integers at in; 0xFFFFFFFF when length is 0. */
+uint32_t simdmin_length(const uint32_t * in, uint32_t length) {
+	const uint32_t nvectors = length / 4;
+	uint32_t smallest = 0xFFFFFFFF;
+	uint32_t pos;
+	/* vectorized part: whole groups of four integers */
+	if (nvectors > 0) {
+		const __m128i* vectors = (const __m128i*)(in);
+		__m128i lanemin = _mm_loadu_si128(vectors);
+		for (pos = 1; pos < nvectors; ++pos) {
+			lanemin = _mm_min_epu32(lanemin, _mm_loadu_si128(vectors + pos));
+		}
+		smallest = minasint(lanemin);
+	}
+	/* scalar part: the up to three trailing integers */
+	for (pos = nvectors * 4; pos < length; ++pos) {
+		if (in[pos] < smallest) {
+			smallest = in[pos];
+		}
+	}
+	return smallest;
+}
+
+/* Stores the unsigned minimum and maximum of the first length integers at in
+ * into *getmin and *getmax; for length 0 they are 0xFFFFFFFF and 0. */
+void simdmaxmin_length(const uint32_t * in, uint32_t length, uint32_t * getmin, uint32_t * getmax) {
+	const uint32_t nvectors = length / 4;
+	uint32_t pos;
+	*getmin = 0xFFFFFFFF;
+	*getmax = 0;
+	/* vectorized part: whole groups of four integers */
+	if (nvectors > 0) {
+		const __m128i* vectors = (const __m128i*)(in);
+		__m128i lanemin = _mm_loadu_si128(vectors);
+		__m128i lanemax = lanemin;
+		for (pos = 1; pos < nvectors; ++pos) {
+			const __m128i current = _mm_loadu_si128(vectors + pos);
+			lanemin = _mm_min_epu32(lanemin, current);
+			lanemax = _mm_max_epu32(lanemax, current);
+		}
+		*getmin = minasint(lanemin);
+		*getmax = maxasint(lanemax);
+	}
+	/* scalar part: the up to three trailing integers */
+	for (pos = nvectors * 4; pos < length; ++pos) {
+		if (in[pos] < *getmin) {
+			*getmin = in[pos];
+		}
+		if (in[pos] > *getmax) {
+			*getmax = in[pos];
+		}
+	}
+}
+
+#endif
+
+/* Bit width needed by the largest of the first length integers at in (0 when length is 0). */
+SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t * in,uint32_t length) {
+	const uint32_t nvectors = length / 4;
+	uint32_t allbits = 0;
+	uint32_t pos;
+	/* vectorized part: OR together whole groups of four integers */
+	if (nvectors > 0) {
+		const __m128i* vectors = (const __m128i*)(in);
+		__m128i ored = _mm_loadu_si128(vectors);
+		for (pos = 1; pos < nvectors; ++pos) {
+			ored = _mm_or_si128(ored, _mm_loadu_si128(vectors + pos));
+		}
+		allbits = orasint(ored);
+	}
+	/* scalar part: the up to three trailing integers */
+	for (pos = nvectors * 4; pos < length; ++pos) {
+		allbits |= in[pos];
+	}
+	return bits(allbits);
+}
+
+
+/* maxbit over 128 integers (SIMDBlockSize) with provided initial value:
+ * returns the bit width of the OR of all successive differences, the first
+ * integer being differenced against initvalue. */
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) {
+    const __m128i  initoffset = _mm_set1_epi32 (initvalue);
+    const __m128i* pin = (const __m128i*)(in);
+    __m128i newvec = _mm_loadu_si128(pin);
+    __m128i accumulator = Delta(newvec , initoffset);
+    __m128i oldvec = newvec;
+    uint32_t k = 1;
+    for(; 4*k < SIMDBlockSize; ++k) {
+        newvec = _mm_loadu_si128(pin+k);
+        /* Delta only needs the last lane of the previous vector */
+        accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec));
+        oldvec = newvec;
+    }
+    return maxbitas32int(accumulator);
+}
+
+
+/* maxbit over |length| integers with provided initial value:
+ * returns the bit width of the OR of all successive differences, the first
+ * integer being differenced against initvalue. |length| is expected to be
+ * positive; 0 is returned for an empty input when asserts are compiled out. */
+uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
+                uint32_t length) {
+    __m128i newvec;
+    __m128i oldvec;
+    __m128i initoffset;
+    __m128i accumulator;
+    const __m128i *pin;
+    uint32_t tmparray[4];
+    uint32_t k = 1;
+    uint32_t acc;
+
+    assert(length > 0);
+    /* with NDEBUG the assert vanishes: never load from an empty input */
+    if (length == 0) {
+        return 0;
+    }
+
+    pin = (const __m128i *)(in);
+    initoffset = _mm_set1_epi32(initvalue);
+    /* fewer than four integers: pad by repeating the last one, so that the
+     * padding lanes contribute zero deltas */
+    switch (length) {
+      case 1:
+        newvec = _mm_set1_epi32(in[0]);
+        break;
+      case 2:
+        newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
+        break;
+      case 3:
+        newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
+        break;
+      default:
+        newvec = _mm_loadu_si128(pin);
+        break;
+    }
+    accumulator = Delta(newvec, initoffset);
+    oldvec = newvec;
+
+    /* process 4 integers at a time and build an accumulator; the bound is
+     * written as a division so that it cannot wrap around for huge lengths */
+    while (k + 1 <= length / 4) {
+        newvec = _mm_loadu_si128(pin + k);
+        accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
+        oldvec = newvec;
+        k++;
+    }
+
+    /* extract the accumulator as an integer */
+    _mm_storeu_si128((__m128i *)(tmparray), accumulator);
+    acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3];
+
+    /* now process the remaining integers; k is at least 4 here, so in[k - 1]
+     * always exists */
+    for (k *= 4; k < length; k++)
+        acc |= in[k] - in[k - 1];
+
+    /* return the number of bits */
+    return bits(acc);
+}
--- a/cpp/simdcomp/src/simdfor.c
+++ b/cpp/simdcomp/src/simdfor.c
--- a/cpp/simdcomp/src/simdintegratedbitpacking.c
+++ b/cpp/simdcomp/src/simdintegratedbitpacking.c
--- a/cpp/simdcomp/src/simdpackedsearch.c
+++ b/cpp/simdcomp/src/simdpackedsearch.c
--- a/cpp/simdcomp/src/simdpackedselect.c
+++ b/cpp/simdcomp/src/simdpackedselect.c
--- a/cpp/simdcomp/tests/unit.c
+++ b/cpp/simdcomp/tests/unit.c
@@ -0,0 +1,900 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "simdcomp.h"
+
+
+
+/* Round-trips simdpack_shortlength / simdunpack_shortlength for every bit
+ * width below 32 and every length up to 128. Returns 0 on success, -1 (after
+ * printing "bug") on the first mismatch. */
+int testshortpack() {
+	int bit;
+	int status = -1;
+	size_t i;
+	size_t length;
+	__m128i * bb;
+	const size_t N = 128;
+	uint32_t * data = NULL;
+	uint32_t * backdata = NULL;
+	uint32_t * buffer = NULL;
+	srand(0);
+	printf("testshortpack\n");
+	for (bit = 0; bit < 32; ++bit) {
+		data = malloc(N * sizeof(uint32_t));
+		backdata = malloc(N * sizeof(uint32_t));
+		buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		for (i = 0; i < N; ++i) {
+			/* unsigned shift: 1 << 31 would overflow a signed int */
+			data[i] = rand() & ((1U << bit) - 1);
+		}
+		for (length = 0; length <= N; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			bb = simdpack_shortlength(data, length, (__m128i *) buffer,
+					bit);
+			/* the returned end pointer must match the advertised compressed size */
+			if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
+				printf("bug\n");
+				goto cleanup;
+			}
+			simdunpack_shortlength((__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					printf("bug\n");
+					goto cleanup;
+				}
+			}
+		}
+		free(data);
+		free(backdata);
+		free(buffer);
+		data = backdata = buffer = NULL;
+	}
+	status = 0;
+cleanup:
+	/* reached with live buffers only on failure; free(NULL) is a no-op */
+	free(data);
+	free(backdata);
+	free(buffer);
+	return status;
+}
+
+/* Round-trips simdpack_length / simdunpack_length for every bit width below
+ * 32 and every length up to 2048. Returns 0 on success, -1 (after printing
+ * "bug") on the first mismatch. */
+int testlongpack() {
+	int bit;
+	int status = -1;
+	size_t i;
+	size_t length;
+	__m128i * bb;
+	const size_t N = 2048;
+	uint32_t * data = NULL;
+	uint32_t * backdata = NULL;
+	uint32_t * buffer = NULL;
+	srand(0);
+	printf("testlongpack\n");
+	for (bit = 0; bit < 32; ++bit) {
+		data = malloc(N * sizeof(uint32_t));
+		backdata = malloc(N * sizeof(uint32_t));
+		buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+		for (i = 0; i < N; ++i) {
+			/* unsigned shift: 1 << 31 would overflow a signed int */
+			data[i] = rand() & ((1U << bit) - 1);
+		}
+		for (length = 0; length <= N; ++length) {
+			for (i = 0; i < N; ++i) {
+				backdata[i] = 0;
+			}
+			bb = simdpack_length(data, length, (__m128i *) buffer,
+					bit);
+			/* the returned end pointer must match the advertised compressed size */
+			if((bb - (__m128i *) buffer) * sizeof(__m128i) != (unsigned) simdpack_compressedbytes(length,bit)) {
+				printf("bug\n");
+				goto cleanup;
+			}
+			simdunpack_length((__m128i *) buffer, length,
+					backdata, bit);
+			for (i = 0; i < length; ++i) {
+				if (data[i] != backdata[i]) {
+					printf("bug\n");
+					goto cleanup;
+				}
+			}
+		}
+		free(data);
+		free(backdata);
+		free(buffer);
+		data = backdata = buffer = NULL;
+	}
+	status = 0;
+cleanup:
+	/* reached with live buffers only on failure; free(NULL) is a no-op */
+	free(data);
+	free(backdata);
+	free(buffer);
+	return status;
+}
+
+
+
+/* Checks simdfastset against simdpack / simdunpack for every bit width below
+ * 32: first a plain round trip, then the block rewritten in reverse order,
+ * then rewritten in place. Returns 0 on success, -1 (after printing "bug") on
+ * the first mismatch. */
+int testset() {
+	int bit;
+	int status = -1;
+	size_t i;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	srand(0);
+
+	for (bit = 0; bit < 32; ++bit) {
+		printf("simple set %d \n",bit);
+
+		for (i = 0; i < N; ++i) {
+			/* unsigned shift: 1 << 31 would overflow a signed int */
+			data[i] = rand() & ((1U << bit) - 1);
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpack(data, (__m128i *) buffer, bit);
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i]) {
+				printf("bug\n");
+				goto cleanup;
+			}
+		}
+
+		/* overwrite slot i - 1 with data[N - i]: the block ends up reversed */
+		for(i = N  ; i > 0; i--) {
+			simdfastset((__m128i *) buffer, bit, data[N - i], i - 1);
+		}
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[N - i - 1]) {
+				printf("bug\n");
+				goto cleanup;
+			}
+		}
+		/* setting every slot to the value it already holds must change nothing */
+		simdpack(data, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastset((__m128i *) buffer, bit, data[i - 1], i - 1);
+		}
+		simdunpack((__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i]) {
+				printf("bug\n");
+				goto cleanup;
+			}
+		}
+
+	}
+	status = 0;
+cleanup:
+	/* shared exit path so that failures do not leak the buffers */
+	free(data);
+	free(backdata);
+	free(buffer);
+
+	return status;
+}
+
+#ifdef __SSE4_1__
+
+/* Checks simdfastsetd1 / simdselectd1 / simdunpackd1 for every bit width
+ * below 32: a block of zeros is overwritten slot by slot with a
+ * non-decreasing sequence, each write being read back immediately.
+ * Returns 0 on success, -1 on the first mismatch. */
+int testsetd1() {
+	int bit;
+	int status = -1;
+	size_t i;
+	uint32_t newvalue;
+	const size_t N = 128;
+	uint32_t * data = malloc(N * sizeof(uint32_t));
+	uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
+
+	uint32_t * backdata = malloc(N * sizeof(uint32_t));
+	uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+	srand(0);
+	for (bit = 0; bit < 32; ++bit) {
+		printf("simple set d1 %d \n",bit);
+		/* unsigned shift: 1 << 31 would overflow a signed int */
+		data[0] = rand() & ((1U << bit) - 1);
+		datazeroes[0] = 0;
+
+		for (i = 1; i < N; ++i) {
+			data[i] = data[i - 1] + (rand() & ((1U << bit) - 1));
+			datazeroes[i] = 0;
+		}
+		for (i = 0; i < N; ++i) {
+			backdata[i] = 0;
+		}
+		simdpackd1(0,datazeroes, (__m128i *) buffer, bit);
+		for(i = 1  ; i <= N; i++) {
+			simdfastsetd1(0,(__m128i *) buffer, bit, data[i - 1], i - 1);
+			newvalue = simdselectd1(0, (const __m128i *) buffer, bit,i - 1);
+			if( newvalue != data[i-1] ) {
+				printf("bad set-select\n");
+				goto cleanup;
+			}
+		}
+		simdunpackd1(0,(__m128i *) buffer, backdata, bit);
+		for (i = 0; i < N; ++i) {
+			if (data[i] != backdata[i])
+				goto cleanup;
+		}
+	}
+	status = 0;
+cleanup:
+	/* shared exit path so that failures do not leak the buffers */
+	free(data);
+	free(backdata);
+	free(buffer);
+	free(datazeroes);
+	return status;
+}
+#endif
+
+/*
+ * Checks simdfastsetFOR/simdselectFOR against simdunpackFOR.
+ * For each bit width 0..31 a block of 128 zeroes is packed with
+ * simdpackFOR (reference value 0). Every slot is then overwritten through
+ * simdfastsetFOR and immediately read back through simdselectFOR. Finally
+ * the whole block is decoded with simdunpackFOR and compared with the
+ * values that were set.
+ * Returns 0 on success and -1 on the first mismatch.
+ */
+int testsetFOR() {
+    const size_t N = 128;
+    int status = 0;
+    int bit;
+    size_t i;
+    uint32_t newvalue;
+    uint32_t * data = malloc(N * sizeof(uint32_t));
+    uint32_t * datazeroes = malloc(N * sizeof(uint32_t));
+    uint32_t * backdata = malloc(N * sizeof(uint32_t));
+    uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+    srand(0);
+    for (bit = 0; bit < 32; ++bit) {
+        /* unsigned literal: shifting a signed 1 by 31 bits is undefined */
+        const uint32_t mask = (1U << bit) - 1;
+        printf("simple set FOR %d \n",bit);
+        for (i = 0; i < N; ++i) {
+            data[i] = rand() & mask;
+            datazeroes[i] = 0;
+        }
+        for (i = 0; i < N; ++i) {
+            backdata[i] = 0;
+        }
+        simdpackFOR(0,datazeroes, (__m128i *) buffer, bit);
+        for (i = 0; i < N; ++i) {
+            simdfastsetFOR(0,(__m128i *) buffer, bit, data[i], i);
+            newvalue = simdselectFOR(0, (const __m128i *) buffer, bit, i);
+            if (newvalue != data[i]) {
+                printf("bad set-select\n");
+                status = -1;
+                goto done;
+            }
+        }
+        simdunpackFOR(0,(__m128i *) buffer, backdata, bit);
+        for (i = 0; i < N; ++i) {
+            if (data[i] != backdata[i]) {
+                status = -1;
+                goto done;
+            }
+        }
+    }
+done:
+    /* single exit so the buffers are released on failure as well */
+    free(data);
+    free(backdata);
+    free(buffer);
+    free(datazeroes);
+    return status;
+}
+
+/*
+ * Round-trip test for the variable-length FOR routines.
+ * For each bit width 0..31 it builds 128 random values of that width
+ * shifted by a fixed reference value (7). For every prefix length 0..128
+ * it packs the prefix with simdpackFOR_length, checks that the number of
+ * bytes written matches simdpackFOR_compressedbytes, unpacks with
+ * simdunpackFOR_length and compares the prefix with the input.
+ * Returns 0 on success and -1 on the first mismatch.
+ */
+int testshortFORpack() {
+    int bit;
+    size_t i;
+    __m128i * rb;
+    size_t length;
+    uint32_t offset = 7;
+    srand(0);
+    for (bit = 0; bit < 32; ++bit) {
+        const size_t N = 128;
+        /* unsigned literal: shifting a signed 1 by 31 bits is undefined */
+        const uint32_t mask = (1U << bit) - 1;
+        int failed = 0;
+        uint32_t * data = malloc(N * sizeof(uint32_t));
+        uint32_t * backdata = malloc(N * sizeof(uint32_t));
+        uint32_t * buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
+
+        for (i = 0; i < N; ++i) {
+            data[i] = (rand() & mask) + offset;
+        }
+        for (length = 0; length <= N && !failed; ++length) {
+            for (i = 0; i < N; ++i) {
+                backdata[i] = 0;
+            }
+            rb = simdpackFOR_length(offset,data, length, (__m128i *) buffer,
+                    bit);
+            /* the returned pointer must sit exactly past the bytes the
+               size helper announces */
+            if(((rb - (__m128i *) buffer)*sizeof(__m128i)) != (unsigned) simdpackFOR_compressedbytes(length,bit)) {
+                failed = 1;
+                break;
+            }
+            simdunpackFOR_length(offset,(__m128i *) buffer, length,
+                    backdata, bit);
+            for (i = 0; i < length; ++i) {
+                if (data[i] != backdata[i]) {
+                    failed = 1;
+                    break;
+                }
+            }
+        }
+        /* released before reporting, so a failure does not leak */
+        free(data);
+        free(backdata);
+        free(buffer);
+        if (failed) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+
+#ifdef __AVX2__
+
+/*
+ * Small AVX2 round-trip test on random data.
+ * For each bit width 0..31 and 100 trials it fills one block of
+ * AVXBlockSize random values of that width, checks that avxmaxbits agrees
+ * with maxbits_length, then packs with avxpackwithoutmask, unpacks with
+ * avxunpack and compares. On a mismatch the whole block is dumped.
+ * Returns 0 on success and -1 on the first failure.
+ */
+int testbabyavx() {
+    int bit;
+    int trial;
+    unsigned int i,j;
+    const size_t N = AVXBlockSize;
+    srand(0);
+    printf("testbabyavx\n");
+    printf("bit = ");
+    for (bit = 0; bit < 32; ++bit) {
+        /* shift an unsigned 1: (1 << 31) on a signed int is undefined */
+        const uint32_t mask = ((uint32_t) 1 << bit) - 1;
+        printf(" %d ",bit);
+        fflush(stdout);
+        for(trial = 0; trial < 100; ++trial) {
+            int failed = 0;
+            uint32_t * data = malloc(N * sizeof(uint32_t)+ 64 * sizeof(uint32_t));
+            uint32_t * backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t) );
+            __m256i * buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32);
+
+            for (i = 0; i < N; ++i) {
+                data[i] = rand() & mask;
+            }
+            for (i = 0; i < N; ++i) {
+                backdata[i] = 0;
+            }
+            if(avxmaxbits(data) != maxbits_length(data,N)) {
+                printf("avxmaxbits is buggy\n");
+                failed = 1;
+            } else {
+                avxpackwithoutmask(data, buffer, bit);
+                avxunpack(buffer, backdata, bit);
+                for (i = 0; i < N; ++i) {
+                    if (data[i] == backdata[i]) {
+                        continue;
+                    }
+                    printf("bug\n");
+                    /* dump the block; %u because the values are unsigned */
+                    for (j = 0; j < N; ++j) {
+                        if (data[j] != backdata[j]) {
+                            printf("data[%u]=%u v.s. backdata[%u]=%u\n",j,data[j],j,backdata[j]);
+                        } else {
+                            printf("data[%u]=%u\n",j,data[j]);
+                        }
+                    }
+                    failed = 1;
+                    break;
+                }
+            }
+            /* released before reporting, so a failure does not leak */
+            free(data);
+            free(backdata);
+            free(buffer);
+            if (failed) {
+                return -1;
+            }
+        }
+    }
+    printf("\n");
+    return 0;
+}
+
+/*
+ * AVX2 round-trip test on arithmetic sequences.
+ * For gap = 1, 3, 9, ... 3^18 the input is datain[k] = k * gap (modulo
+ * 2^32). Each block of AVXBlockSize values is measured with avxmaxbits,
+ * packed with avxpackwithoutmask, unpacked with avxunpack and compared.
+ * Returns 0 on success, -1 if avxmaxbits disagrees with maxbits_length,
+ * and -2 if a block does not survive the round trip.
+ */
+int testavx2() {
+    int N = 5000 * AVXBlockSize, gap;
+    int status = 0;
+    __m256i * buffer = malloc(AVXBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(AVXBlockSize * sizeof(uint32_t));
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", (unsigned) gap);
+        for (k = 0; k < N; ++k) {
+            /* multiply as unsigned: k * gap exceeds INT_MAX for the larger
+               gaps, and signed overflow is undefined */
+            datain[k] = (uint32_t) k * (uint32_t) gap;
+        }
+        for (k = 0; k * AVXBlockSize < N; ++k) {
+            /*
+               Works for general arrays (sorted or unsorted)
+            */
+            uint32_t * block = datain + k * AVXBlockSize;
+            int j;
+            /* we compute the bit width */
+            const uint32_t b = avxmaxbits(block);
+            const uint32_t expected = maxbits_length(block,AVXBlockSize);
+            if(b != expected) {
+                printf("avxmaxbits is buggy %u %u \n", b, expected);
+                status = -1;
+                goto done;
+            }
+            printf("bit width = %u\n",b);
+
+            /* we read AVXBlockSize integers at "block" and
+               write b 256-bit vectors at "buffer" */
+            avxpackwithoutmask(block, buffer, b);
+            /* we read back b 256-bit vectors at "buffer" and write
+               AVXBlockSize integers at backbuffer */
+            avxunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < AVXBlockSize; ++j) {
+                if (backbuffer[j] != block[j]) {
+                    int i;
+                    printf("bug in avxpack\n");
+                    for(i = 0; i < AVXBlockSize; ++i) {
+                        printf("data[%d]=%u got back %u %s\n",i,
+                                block[i],backbuffer[i],
+                                block[i]!=backbuffer[i]?"bug":"");
+                    }
+                    status = -2;
+                    goto done;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+done:
+    /* single exit so the buffers are released on failure as well */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
+#endif /* avx2 */
+
+/*
+ * SSE round-trip test on arithmetic sequences.
+ * For gap = 1, 3, 9, ... 3^18 the input is datain[k] = k * gap (modulo
+ * 2^32). Each block of SIMDBlockSize values goes through two round trips:
+ *   1. maxbits + simdpackwithoutmask + simdunpack (plain packing);
+ *   2. simdmaxbitsd1 + simdpackwithoutmaskd1 + simdunpackd1 (differential
+ *      coding), always relative to an initial value of 0.
+ * Returns 0 on success, -2 if the plain round trip fails, and -3 if the
+ * differential one fails.
+ */
+int test() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int status = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", (unsigned) gap);
+        for (k = 0; k < N; ++k) {
+            /* multiply as unsigned: k * gap exceeds INT_MAX for the larger
+               gaps, and signed overflow is undefined */
+            datain[k] = (uint32_t) k * (uint32_t) gap;
+        }
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            uint32_t * block = datain + k * SIMDBlockSize;
+            /* every block is delta-coded from 0; the original code stored
+               the last value of the block into a block-local variable
+               that was never read again, so that store is gone */
+            const uint32_t offset = 0;
+            uint32_t b, b1;
+            int j;
+
+            /*
+               First part works for general arrays (sorted or unsorted)
+            */
+            /* we compute the bit width */
+            b = maxbits(block);
+            /* we read 128 integers at "block" and
+               write b 128-bit vectors at "buffer" */
+            simdpackwithoutmask(block, buffer, b);
+            /* we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (backbuffer[j] != block[j]) {
+                    printf("bug in simdpack\n");
+                    status = -2;
+                    goto done;
+                }
+            }
+
+            /*
+               Next part uses differential coding
+            */
+            /* we compute the bit width */
+            b1 = simdmaxbitsd1(offset, block);
+            /* we read 128 integers at "block" and
+               write b1 128-bit vectors at "buffer" */
+            simdpackwithoutmaskd1(offset, block, buffer, b1);
+            /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpackd1(offset, buffer, backbuffer, b1);
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (backbuffer[j] != block[j]) {
+                    printf("bug in simdpack d1\n");
+                    status = -3;
+                    goto done;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+done:
+    /* single exit so the buffers are released on failure as well */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
+
+#ifdef __SSE4_1__
+/*
+ * Frame-of-reference round-trip test on arithmetic sequences.
+ * For gap = 1, 2, 4, ... 2^28 the input is datain[k] = k * gap (modulo
+ * 2^32). For each block of SIMDBlockSize values the minimum and maximum
+ * are found with simdmaxmin_length, the block is packed with simdpackFOR
+ * relative to the minimum, every slot is read back with simdselectFOR and
+ * the whole block is decoded with simdunpackFOR.
+ * Returns 0 on success, -3 if simdselectFOR returns a wrong value, and
+ * -2 if simdunpackFOR does.
+ */
+int testFOR() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int status = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t tmax, tmin, tb;
+    for (gap = 1; gap <= 387420489; gap *= 2) {
+        int k;
+        printf(" gap = %u \n", (unsigned) gap);
+        for (k = 0; k < N; ++k) {
+            /* multiply as unsigned: k * gap exceeds INT_MAX for the larger
+               gaps, and signed overflow is undefined */
+            datain[k] = (uint32_t) k * (uint32_t) gap;
+        }
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            uint32_t * block = datain + k * SIMDBlockSize;
+            int j;
+            simdmaxmin_length(block,SIMDBlockSize,&tmin,&tmax);
+            /* we compute the bit width */
+            tb  = bits(tmax - tmin);
+
+            /* we read 128 integers at "block" and
+               write tb 128-bit vectors at "buffer" */
+            simdpackFOR(tmin,block, buffer, tb);
+
+            /* random access into the packed block */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                uint32_t selectedvalue = simdselectFOR(tmin,buffer,tb,j);
+                if (selectedvalue != block[j]) {
+                    printf("bug in simdselectFOR\n");
+                    status = -3;
+                    goto done;
+                }
+            }
+            /* we read back tb 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpackFOR(tmin,buffer, backbuffer, tb);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                if (backbuffer[j] != block[j]) {
+                    printf("bug in simdpackFOR\n");
+                    status = -2;
+                    goto done;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+done:
+    /* single exit so the buffers are released on failure as well */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
+#endif
+
+#define MAX 300
+/*
+ * Exercises simdmaxbitsd1_length on prefixes of every length 1..MAX.
+ * A prefix of length n holds 1, 2, ..., n-1 followed by n+1, so relative
+ * to an initial value of 0 its deltas are n-1 ones and a final 2; the
+ * expected bit width is therefore always 2.
+ * Returns 0 on success and -1 as soon as another width is reported.
+ */
+int test_simdmaxbitsd1_length() {
+    uint32_t values[MAX + 1];
+    uint32_t width;
+    int last, pos;
+
+    /* every byte starts as 0xff; entries past the tested prefix keep that
+       pattern */
+    memset(values, 0xff, sizeof(values));
+
+    for (last = 0; last < MAX; ++last) {
+        /* leading entries step by one ... */
+        for (pos = 0; pos < last; ++pos) {
+            values[pos] = pos + 1;
+        }
+        /* ... and the final entry steps by two */
+        values[last] = last + 2;
+
+        width = simdmaxbitsd1_length(0, values, last + 1);
+        if (width == 2) {
+            continue;
+        }
+        printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n",
+                width, last);
+        return -1;
+    }
+    printf("simdmaxbitsd1_length: ok\n");
+    return 0;
+}
+
+/*
+ * Three-way comparison of two uint32_t values passed by address, in the
+ * form expected by qsort: -1 if *a < *b, 1 if *a > *b, 0 if they are equal.
+ */
+int uint32_cmp(const void *a, const void *b)
+{
+    const uint32_t lhs = *(const uint32_t *)a;
+    const uint32_t rhs = *(const uint32_t *)b;
+    /* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+#ifdef __SSE4_1__
+/*
+ * Tests simdsearchd1 on the delta-coded sequence 1, 2, ..., 128 packed
+ * with every bit width from 1 to 32.
+ * All checks are assert()s, so they disappear when NDEBUG is defined; the
+ * function itself always returns 0.
+ */
+int test_simdpackedsearch() {
+    uint32_t buffer[128];
+    uint32_t result = 0;
+    int b, i;
+    uint32_t init = 0;
+    __m128i initial = _mm_set1_epi32(init);
+
+    /* initialize the buffer */
+    for (i = 0; i < 128; i++)
+        buffer[i] = (uint32_t)(i + 1);
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 1; b <= 32; b++) {
+        uint32_t out[128];
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
+        initial = _mm_setzero_si128();
+        printf("simdsearchd1: %d bits\n", b);
+
+        /* now perform the searches */
+        /* 'initial' is passed by address and reset before every call;
+         * presumably simdsearchd1 updates it - confirm against the library */
+        initial = _mm_set1_epi32(init);
+        /* key 0 is below every stored value: position 0 is expected */
+        assert(simdsearchd1(&initial, (__m128i *)out, b, 0, &result) == 0);
+        assert(result > 0);
+
+        /* key i is stored at index i - 1 */
+        for (i = 1; i <= 128; i++) {
+        	initial = _mm_set1_epi32(init);
+            assert(simdsearchd1(&initial, (__m128i *)out, b,
+                                    (uint32_t)i, &result) == i - 1);
+            assert(result == (unsigned)i);
+        }
+        /* key 200 is above every stored value: position 128 is expected */
+        initial = _mm_set1_epi32(init);
+        assert(simdsearchd1(&initial, (__m128i *)out, b, 200, &result)
+                        == 128);
+        assert(result > 200);
+    }
+    printf("simdsearchd1: ok\n");
+    return 0;
+}
+
+/*
+ * Tests simdselectFOR and simdsearchwithlengthFOR on frame-of-reference
+ * packed blocks.
+ * For b = 1..32 a block of 128 values derived from the largest b-bit
+ * value is packed with simdpackFOR relative to its minimum. Every slot is
+ * then read back with simdselectFOR, and every stored value is looked up
+ * with simdsearchwithlengthFOR.
+ * All checks are assert()s, so they disappear when NDEBUG is defined; the
+ * function itself always returns 0.
+ */
+int test_simdpackedsearchFOR() {
+    uint32_t buffer[128];
+    uint32_t result = 0;
+    int b;
+    uint32_t i;
+    uint32_t maxv, tmin, tmax, tb;
+    uint32_t out[128];
+
+    /* this test creates frame-of-reference encoded buffers with different
+     * bits, then searches for each stored key */
+    for (b = 1; b <= 32; b++) {
+        /* initialize the buffer */
+        maxv = (b == 32)
+                ? 0xFFFFFFFF
+                : ((1U<<b) - 1);
+        /* NOTE(review): maxv * (i + 1) is evaluated in 32 bits and wraps
+         * for large b, so the block is not necessarily increasing; the
+         * checks below only require that a search lands on an equal value */
+        for (i = 0; i < 128; i++)
+            buffer[i] = maxv * (i + 1) / 128;
+        simdmaxmin_length(buffer,SIMDBlockSize,&tmin,&tmax);
+        /* we compute the bit width */
+        tb  = bits(tmax - tmin);
+        /* FOR-encode to 'tb' bits */
+        simdpackFOR(tmin, buffer, (__m128i *)out, tb);
+        /* the label used to say "simdsearchd1", copied from the d1 test */
+        printf("simdsearchFOR: %d bits\n", b);
+
+        /* random access must return every stored value */
+        for (i = 0; i < 128; i++) {
+            assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb,i));
+        }
+        /* now perform the searches */
+        for (i = 0; i < 128; i++) {
+            int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb,
+                                    128,buffer[i], &result) ;
+            assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == buffer[x]);
+            assert(simdselectFOR(tmin, (__m128i *)out, tb,x) == result);
+            assert(buffer[x] == result);
+            assert(result == buffer[i]);
+            assert(buffer[x] == buffer[i]);
+        }
+    }
+    printf("simdsearchFOR: ok\n");
+    return 0;
+}
+
+/*
+ * Tests simdsearchd1 and simdsearchwithlengthd1 on less regular data.
+ * For b = 0..32 it builds 128 pseudo-random deltas below 2^b, sorts them,
+ * turns them into a non-decreasing sequence, delta-packs the block with b
+ * bits and checks the round trip. It then searches for every stored value
+ * v, for v - 1 and for v + 1, and checks that both search routines agree
+ * and return the first position whose value is not below the key.
+ * All checks are assert()s, so they disappear when NDEBUG is defined; the
+ * function itself always returns 0.
+ */
+int test_simdpackedsearch_advanced() {
+    uint32_t buffer[128];
+    uint32_t backbuffer[128];
+    uint32_t out[128];
+    uint32_t result = 0;
+    uint32_t b, i;
+    uint32_t init = 0;
+    __m128i initial = _mm_set1_epi32(init);
+
+    /* this test creates delta encoded buffers with different bits, then
+     * performs lower bound searches for each key */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = init;
+        /* initialize the buffer */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)(1431655765 * i + 0xFFFFFFFF)) ;
+            /* 1U: shifting a signed 1 by 31 bits is undefined */
+            if(b < 32) buffer[i] %= (1U<<b);
+        }
+
+        qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
+
+        /* prefix sums turn the deltas into absolute values */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+        /* a sum that wrapped around is clamped to its predecessor so the
+           sequence stays non-decreasing */
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(init, buffer)<=b);
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
+        simdunpackd1(init,  (__m128i *)out, backbuffer, b);
+
+        for (i = 0; i < 128; i++) {
+            assert(buffer[i] == backbuffer[i]);
+        }
+
+        printf("advanced simdsearchd1: %u bits\n", b);
+
+        /* search for every stored value */
+        for (i = 0; i < 128; i++) {
+            int pos;
+            initial = _mm_set1_epi32(init);
+            pos = simdsearchd1(&initial, (__m128i *)out, b,
+                    buffer[i], &result);
+            assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i], &result));
+            assert(buffer[pos] == buffer[i]);
+            if(pos > 0)
+                assert(buffer[pos - 1] < buffer[i]);
+            assert(result == buffer[i]);
+        }
+        /* search for the value just below each stored value */
+        for (i = 0; i < 128; i++) {
+            int pos;
+            if(buffer[i] == 0) continue;
+            initial = _mm_set1_epi32(init);
+            pos = simdsearchd1(&initial, (__m128i *)out, b,
+                    buffer[i] - 1, &result);
+            assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i] - 1, &result));
+            assert(buffer[pos] >= buffer[i]  - 1);
+            if(pos > 0)
+                assert(buffer[pos - 1] < buffer[i]  - 1);
+            assert(result == buffer[pos]);
+        }
+        /* search for the value just above each stored value */
+        for (i = 0; i < 128; i++) {
+            int pos;
+            if (buffer[i] + 1 == 0)
+                continue;
+            initial = _mm_set1_epi32(init);
+            pos = simdsearchd1(&initial, (__m128i *) out, b,
+                    buffer[i] + 1, &result);
+            assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
+                    buffer[i] + 1, &result));
+            if(pos == 128) {
+                /* nothing is larger: the key came from the maximum */
+                assert(buffer[i] == buffer[127]);
+            } else {
+                assert(buffer[pos] >= buffer[i] + 1);
+                if (pos > 0)
+                    assert(buffer[pos - 1] < buffer[i] + 1);
+                assert(result == buffer[pos]);
+            }
+        }
+    }
+    printf("advanced simdsearchd1: ok\n");
+    return 0;
+}
+
+/*
+ * Tests simdselectd1 on the sequence 33, 34, ..., 160 delta-packed
+ * (relative to an initial value of 33) with every bit width from 1 to 32.
+ * All checks are assert()s, so they disappear when NDEBUG is defined; the
+ * function itself always returns 0.
+ */
+int test_simdpackedselect() {
+    uint32_t buffer[128];
+    uint32_t initial = 33;
+    int b, i;
+
+    /* initialize the buffer */
+    for (i = 0; i < 128; i++)
+        buffer[i] = (uint32_t)(initial + i);
+
+    /* this test creates delta encoded buffers with different bits, then
+     * reads every slot back through random access */
+    for (b = 1; b <= 32; b++) {
+        uint32_t out[128];
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+
+        printf("simdselectd1: %d bits\n", b);
+
+        /* now select every slot and compare with the value stored there */
+        for (i = 0; i < 128; i++) {
+            assert(simdselectd1(initial, (__m128i *)out, b, (uint32_t)i)
+                            == initial + i);
+        }
+    }
+    printf("simdselectd1: ok\n");
+    return 0;
+}
+
+/*
+ * Tests simdselectd1 on less regular data.
+ * For b = 0..32 it builds 128 deltas below 2^b, turns them into a
+ * non-decreasing sequence starting from 33, delta-packs the block with b
+ * bits and reads every slot back with simdselectd1.
+ * All checks are assert()s, so they disappear when NDEBUG is defined; the
+ * function itself always returns 0.
+ */
+int test_simdpackedselect_advanced() {
+    uint32_t buffer[128];
+    uint32_t initial = 33;
+    uint32_t b;
+    int i;
+
+    /* this test creates delta encoded buffers with different bits, then
+     * reads every slot back through random access */
+    for (b = 0; b <= 32; b++) {
+        uint32_t prev = initial;
+        uint32_t out[128];
+        /* initialize the buffer */
+        for (i = 0; i < 128; i++) {
+            buffer[i] =  ((uint32_t)(165576 * i)) ;
+            /* 1U: shifting a signed 1 by 31 bits is undefined */
+            if(b < 32) buffer[i] %= (1U<<b);
+        }
+        /* prefix sums turn the deltas into absolute values */
+        for (i = 0; i < 128; i++) {
+            buffer[i] = buffer[i] + prev;
+            prev = buffer[i];
+        }
+
+        /* a sum that wrapped around is clamped to its predecessor so the
+           sequence stays non-decreasing */
+        for (i = 1; i < 128; i++) {
+            if(buffer[i] < buffer[i-1] )
+                buffer[i] = buffer[i-1];
+        }
+        assert(simdmaxbitsd1(initial, buffer)<=b);
+
+        for (i = 0; i < 128; i++) {
+            out[i] = 0; /* memset would do too */
+        }
+
+        /* delta-encode to 'b' bits */
+        simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
+
+        printf("simdselectd1: %u bits\n", b);
+
+        /* now select every slot and compare with the value stored there */
+        for (i = 0; i < 128; i++) {
+            uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i);
+            assert(valretrieved == buffer[i]);
+        }
+    }
+    printf("advanced simdselectd1: ok\n");
+    return 0;
+}
+#endif
+
+
+/*
+ * Test driver: runs every test of this file in a fixed order and stops at
+ * the first one that returns a non-zero status, printing the label of the
+ * failed step and returning that status. Tests that need SSE4.1 or AVX2
+ * are only part of the list when the matching macro is defined.
+ */
+int main() {
+    typedef int (*unit_fn)();
+    static const struct {
+        unit_fn run;          /* test entry point */
+        const char *onfail;   /* message printed when it fails */
+    } suite[] = {
+        { testsetFOR, "test failure 1\n" },
+#ifdef __SSE4_1__
+        { testsetd1, "test failure 2\n" },
+#endif
+        { testset, "test failure 3\n" },
+        { testshortFORpack, "test failure 4\n" },
+        { testshortpack, "test failure 5\n" },
+        { testlongpack, "test failure 6\n" },
+#ifdef __SSE4_1__
+        { test_simdpackedsearchFOR, "test failure 7\n" },
+        { testFOR, "test failure 8\n" },
+#endif
+#ifdef __AVX2__
+        { testbabyavx, "test failure baby avx\n" },
+        { testavx2, "test failure 9 avx\n" },
+#endif
+        { test, "test failure 9\n" },
+        { test_simdmaxbitsd1_length, "test failure 10\n" },
+#ifdef __SSE4_1__
+        { test_simdpackedsearch, "test failure 11\n" },
+        { test_simdpackedsearch_advanced, "test failure 12\n" },
+        { test_simdpackedselect, "test failure 13\n" },
+        { test_simdpackedselect_advanced, "test failure 14\n" },
+#endif
+    };
+    size_t idx;
+
+    for (idx = 0; idx < sizeof(suite) / sizeof(suite[0]); ++idx) {
+        const int r = suite[idx].run();
+        if (r) {
+            printf("%s", suite[idx].onfail);
+            return r;
+        }
+    }
+    printf("All tests OK!\n");
+
+    return 0;
+}
--- a/cpp/simdcomp/tests/unit_chars.c
+++ b/cpp/simdcomp/tests/unit_chars.c
@@ -0,0 +1,102 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "simdcomp.h"
+
+
+#define get_random_char() (uint8_t)(rand() % 256);
+
+
+/*
+ * Round-trip test on random bytes.
+ * The input is refilled with random 32-bit words (four random bytes each)
+ * once per gap value; gap itself only appears in the progress output.
+ * Each block of SIMDBlockSize words goes through a plain round trip
+ * (maxbits + simdpackwithoutmask + simdunpack) and a differential one
+ * (simdmaxbitsd1 + simdpackwithoutmaskd1 + simdunpackd1, initial value 0),
+ * and the result is compared byte by byte with the input.
+ * Returns 0 on success, -2 if the plain round trip fails, and -3 if the
+ * differential one fails.
+ */
+int main() {
+    int N = 5000 * SIMDBlockSize, gap;
+    int status = 0;
+    __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+
+    srand(time(NULL));
+
+    for (gap = 1; gap <= 387420489; gap *= 3) {
+        int k;
+        printf(" gap = %u \n", (unsigned) gap);
+
+        /* simulate some random character string, don't care about endianness */
+        for (k = 0; k < N; ++k) {
+            uint8_t _tmp[4];
+
+            _tmp[0] = get_random_char();
+            _tmp[1] = get_random_char();
+            _tmp[2] = get_random_char();
+            _tmp[3] = get_random_char();
+
+            memmove(&datain[k], _tmp, 4);
+        }
+        for (k = 0; k * SIMDBlockSize < N; ++k) {
+            uint32_t * block = datain + k * SIMDBlockSize;
+            /* every block is delta-coded from 0; the original code stored
+               the last value of the block into a block-local variable
+               that was never read again, so that store is gone */
+            const uint32_t offset = 0;
+            uint32_t b, b1;
+            int j;
+
+            /*
+               First part works for general arrays (sorted or unsorted)
+            */
+            /* we compute the bit width */
+            b = maxbits(block);
+            /* we read 128 integers at "block" and
+               write b 128-bit vectors at "buffer" */
+            simdpackwithoutmask(block, buffer, b);
+            /* we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpack(buffer, backbuffer, b);/* uncompressed */
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                uint8_t chars_back[4];
+                uint8_t chars_in[4];
+
+                memmove(chars_back, &backbuffer[j], 4);
+                memmove(chars_in, &block[j], 4);
+
+                if (chars_in[0] != chars_back[0]
+                    || chars_in[1] != chars_back[1]
+                    || chars_in[2] != chars_back[2]
+                    || chars_in[3] != chars_back[3]) {
+                    printf("bug in simdpack\n");
+                    status = -2;
+                    goto done;
+                }
+            }
+
+            /*
+               Next part uses differential coding
+            */
+            /* we compute the bit width */
+            b1 = simdmaxbitsd1(offset, block);
+            /* we read 128 integers at "block" and
+               write b1 128-bit vectors at "buffer" */
+            simdpackwithoutmaskd1(offset, block, buffer, b1);
+            /* we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer */
+            simdunpackd1(offset, buffer, backbuffer, b1);
+            for (j = 0; j < SIMDBlockSize; ++j) {
+                uint8_t chars_back[4];
+                uint8_t chars_in[4];
+
+                memmove(chars_back, &backbuffer[j], 4);
+                memmove(chars_in, &block[j], 4);
+
+                if (chars_in[0] != chars_back[0]
+                    || chars_in[1] != chars_back[1]
+                    || chars_in[2] != chars_back[2]
+                    || chars_in[3] != chars_back[3]) {
+                    printf("bug in simdpack\n");
+                    status = -3;
+                    goto done;
+                }
+            }
+        }
+    }
+    printf("Code looks good.\n");
+done:
+    /* single exit so the buffers are released on failure as well */
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return status;
+}
--- a/cpp/simdcomp_wrapper.c
+++ b/cpp/simdcomp_wrapper.c
@@ -0,0 +1,42 @@
+#include "simdcomp.h"
+#include "simdcomputil.h"
+
+// Delta-packs one block read from datain into output.
+// Layout written: one byte holding the bit width computed by
+// simdmaxbitsd1, immediately followed by the packed payload.
+// Assumes datain has a size of 128 uint32 and that output is large
+// enough to host the data.
+// Returns the number of bytes written: 1 + bit width * sizeof(__m128i).
+size_t compress_sorted(
+        const uint32_t* datain,
+        uint8_t* output,
+        const uint32_t offset) {
+    const uint32_t num_bits = simdmaxbitsd1(offset, datain);
+    // the payload starts right after the one-byte header
+    __m128i* const payload = (__m128i *) (output + 1);
+    output[0] = num_bits;
+    simdpackwithoutmaskd1(offset, datain, payload, num_bits);
+    return 1 + num_bits * sizeof(__m128i);
+}
+
+// Decodes one delta-packed block from compressed_data into output.
+// The first byte is read as the bit width; the packed payload follows it
+// (the layout written by compress_sorted).
+// Assumes output can hold one full block of 128 uint32 and that
+// compressed_data holds the complete packed block.
+// Returns the number of bytes consumed: 1 + bit width * sizeof(__m128i).
+size_t uncompress_sorted(
+        const uint8_t* compressed_data,
+        uint32_t* output,
+        uint32_t offset) {
+    const uint32_t b = *compressed_data++;
+    // the input is only read, so the cast keeps its const qualifier
+    simdunpackd1(offset, (const __m128i *)compressed_data, output, b);
+    return 1 + b * sizeof(__m128i);
+}
+
+// Bit-packs one block read from datain into output, without delta coding.
+// Layout written: one byte holding the bit width computed by maxbits,
+// immediately followed by the packed payload.
+// Presumably datain holds 128 uint32 and output is large enough, as for
+// compress_sorted - confirm with the callers.
+// Returns the number of bytes written: 1 + bit width * sizeof(__m128i).
+size_t compress_unsorted(
+        const uint32_t* datain,
+        uint8_t* output) {
+    const uint32_t num_bits = maxbits(datain);
+    // the payload starts right after the one-byte header
+    __m128i* const payload = (__m128i *) (output + 1);
+    output[0] = num_bits;
+    simdpackwithoutmask(datain, payload, num_bits);
+    return 1 + num_bits * sizeof(__m128i);
+}
+
+// Decodes one bit-packed block (no delta coding) from compressed_data
+// into output.
+// The first byte is read as the bit width; the packed payload follows it
+// (the layout written by compress_unsorted).
+// Presumably output must hold one full block of 128 uint32 - confirm with
+// the callers.
+// Returns the number of bytes consumed: 1 + bit width * sizeof(__m128i).
+size_t uncompress_unsorted(
+        const uint8_t* compressed_data,
+        uint32_t* output) {
+    const uint32_t b = *compressed_data++;
+    // the input is only read, so the cast keeps its const qualifier
+    simdunpack((const __m128i *)compressed_data, output, b);
+    return 1 + b * sizeof(__m128i);
+}
--- a/cpp/streamvbyte/.gitignore
+++ b/cpp/streamvbyte/.gitignore
@@ -0,0 +1,32 @@
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
--- a/cpp/streamvbyte/.travis.yml
+++ b/cpp/streamvbyte/.travis.yml
@@ -0,0 +1,7 @@
+language: c
+sudo: false
+compiler:
+  - gcc
+  - clang
+
+script: make && ./unit
--- a/cpp/streamvbyte/LICENSE
+++ b/cpp/streamvbyte/LICENSE
@@ -0,0 +1,202 @@
+Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
--- a/cpp/streamvbyte/README.md
+++ b/cpp/streamvbyte/README.md
@@ -0,0 +1,60 @@
+streamvbyte
+===========
+[![Build Status](https://travis-ci.org/lemire/streamvbyte.png)](https://travis-ci.org/lemire/streamvbyte)
+
+StreamVByte is a new integer compression technique that applies SIMD instructions (vectorization) to
+Google's Group Varint approach. The net result is faster than other byte-oriented compression
+techniques.
+
+The approach is patent-free, the code is available under the Apache License.
+
+
+It includes fast differential coding.
+
+It assumes a recent Intel processor (e.g., Haswell or better).
+
+The code should build using most standard-compliant C99 compilers. The provided makefile
+expects a Linux-like system.
+
+
+Usage:
+
+      make
+      ./unit
+
+See example.c for an example.
+
+Short code sample:
+```C
+// suppose that datain is an array of uint32_t integers
+size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
+// here the result is stored in compressedbuffer using compsize bytes
+streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast)
+```
+
+If the values are sorted, then it might be preferable to use differential coding:
+```C
+// suppose that datain is an array of uint32_t integers
+size_t compsize = streamvbyte_delta_encode(datain, N, compressedbuffer,0); // encoding
+// here the result is stored in compressedbuffer using compsize bytes
+streamvbyte_delta_decode(compressedbuffer, recovdata, N,0); // decoding (fast)
+```
+You have to know how many integers were coded when you decompress. You can store this 
+information along with the compressed stream.
+
+See also
+--------
+* SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersection
+* The FastPFOR C++ library : Fast integer compression https://github.com/lemire/FastPFor
+* High-performance dictionary coding https://github.com/lemire/dictionary
+* LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker
+* The SIMDComp library: A simple C library for compressing lists of integers using binary packing https://github.com/lemire/simdcomp
+* MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
+* CSharpFastPFOR: A C#  integer compression library  https://github.com/Genbox/CSharpFastPFOR
+* JavaFastPFOR: A Java integer compression library https://github.com/lemire/JavaFastPFOR
+* Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
+* FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference
+* libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte
+* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
+* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
+
--- a/cpp/streamvbyte/example.c
+++ b/cpp/streamvbyte/example.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "streamvbyte.h"
+
+int main() {
+	int N = 5000;
+	// worst case: 4 data bytes per integer plus one control byte per 4 integers
+	size_t maxbytes = N * sizeof(uint32_t) + (N + 3) / 4;
+	uint32_t * datain = malloc(N * sizeof(uint32_t));
+	uint8_t * compressedbuffer = malloc(maxbytes);
+	uint32_t * recovdata = malloc(N * sizeof(uint32_t));
+	int ok = datain != NULL && compressedbuffer != NULL && recovdata != NULL;
+	if (ok) { // only touch the buffers when every allocation succeeded
+		for (int k = 0; k < N; ++k) datain[k] = 120;
+		size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
+		// here the result is stored in compressedbuffer using compsize bytes
+		size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast)
+		assert(compsize == compsize2);
+		printf("Compressed %d integers down to %d bytes.\n",N,(int) compsize);
+	}
+	free(datain); free(compressedbuffer); free(recovdata); // free(NULL) is a no-op
+	return ok ? 0 : 1;
+}
--- a/cpp/streamvbyte/include/streamvbyte.h
+++ b/cpp/streamvbyte/include/streamvbyte.h
@@ -0,0 +1,19 @@
+
+#ifndef VARINTDECODE_H_
+#define VARINTDECODE_H_
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <stdint.h>// please use a C99-compatible compiler
+#include <stddef.h>
+
+
+// Encode an array of a given length read from in to out in varint format.
+// Returns the number of bytes written.
+size_t streamvbyte_encode(const uint32_t *in, uint32_t length, uint8_t *out);
+
+// Read "length" 32-bit integers in varint format from in, storing the result in out.
+// Returns the number of bytes read.
+size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t length);
+
+
+#endif /* VARINTDECODE_H_ */
--- a/cpp/streamvbyte/include/streamvbytedelta.h
+++ b/cpp/streamvbyte/include/streamvbytedelta.h
@@ -0,0 +1,24 @@
+/*
+ * streamvbytedelta.h
+ *
+ *  Created on: Apr 14, 2016
+ *      Author: lemire
+ */
+
+#ifndef INCLUDE_STREAMVBYTEDELTA_H_
+#define INCLUDE_STREAMVBYTEDELTA_H_
+
+#include <stddef.h> // size_t
+#include <stdint.h> // uint8_t, uint32_t
+
+// Encode an array of a given length read from in to out in StreamVByte format.
+// Returns the number of bytes written.
+// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
+size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t length, uint8_t *out, uint32_t  prev);
+
+// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out.
+// Returns the number of bytes read.
+// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
+size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out, uint32_t length, uint32_t  prev);
+
+#endif /* INCLUDE_STREAMVBYTEDELTA_H_ */
--- a/cpp/streamvbyte/makefile
+++ b/cpp/streamvbyte/makefile
@@ -0,0 +1,58 @@
+# minimalist makefile
+.SUFFIXES:
+#
+.SUFFIXES: .cpp .o .c .h
+
+CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow
+LDFLAGS = -shared
+LIBNAME=libstreamvbyte.so.0.0.1
+all:  unit $(LIBNAME)
+test: unit
+	./unit
+install: $(LIBNAME)
+	cp $(LIBNAME) /usr/local/lib
+	ln -sf /usr/local/lib/$(LIBNAME) /usr/local/lib/libstreamvbyte.so
+	ldconfig
+	cp $(HEADERS) /usr/local/include
+
+
+
+HEADERS=./include/streamvbyte.h ./include/streamvbytedelta.h 
+
+uninstall:
+	for h in $(HEADERS) ; do rm  /usr/local/$$h; done
+	rm  /usr/local/lib/$(LIBNAME)
+	rm /usr/local/lib/libstreamvbyte.so
+	ldconfig
+
+
+OBJECTS= streamvbyte.o streamvbytedelta.o
+
+
+
+streamvbytedelta.o: ./src/streamvbytedelta.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/streamvbytedelta.c -Iinclude
+
+
+streamvbyte.o: ./src/streamvbyte.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/streamvbyte.c -Iinclude
+
+
+
+$(LIBNAME): $(OBJECTS)
+	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS)  $(LDFLAGS)
+
+
+
+
+example: ./example.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o example ./example.c -Iinclude  $(OBJECTS)
+
+unit: ./tests/unit.c    $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude  $(OBJECTS)
+
+dynunit: ./tests/unit.c    $(HEADERS) $(LIBNAME)
+	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude  -lstreamvbyte
+
+clean:
+	rm -f unit dynunit *.o $(LIBNAME) example
--- a/cpp/streamvbyte/src/streamvbyte.c
+++ b/cpp/streamvbyte/src/streamvbyte.c
@@ -0,0 +1,495 @@
+#include "streamvbyte.h"
+#if defined(_MSC_VER)
+     /* Microsoft C/C++-compatible compiler */
+     #include <intrin.h>
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+     /* GCC-compatible compiler, targeting x86/x86-64 */
+     #include <x86intrin.h>
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+     /* GCC-compatible compiler, targeting ARM with NEON */
+     #include <arm_neon.h>
+#elif defined(__GNUC__) && defined(__IWMMXT__)
+     /* GCC-compatible compiler, targeting ARM with WMMX */
+     #include <mmintrin.h>
+#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
+     /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
+     #include <altivec.h>
+#elif defined(__GNUC__) && defined(__SPE__)
+     /* GCC-compatible compiler, targeting PowerPC with SPE */
+     #include <spe.h>
+#endif
+// lengthTable[k]: sum over the four 2-bit fields of k of (field + 1), i.e. 4..16.
+static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
+		10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
+		9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
+		11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
+		8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
+		12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
+		13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
+		11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
+		9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
+		10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
+		13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
+		13, 14, 15, 16 };
+
+static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
+		-1, -1, 3, -1, -1, -1 }, // 1111
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 2111
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 3111
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 4111
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 1211
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 2211
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 3211
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 4211
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 1311
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 2311
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 3311
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 },      // 4311
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 },    // 1411
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 },     // 2411
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 },      // 3411
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 },       // 4411
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 },  // 1121
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 2121
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 3121
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 4121
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 1221
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 2221
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 3221
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 4221
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 1321
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 2321
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 3321
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 },       // 4321
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 },     // 1421
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 },      // 2421
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 },       // 3421
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 },       // 4421
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 },   // 1131
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 2131
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 3131
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 4131
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 1231
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 2231
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 3231
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 4231
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 1331
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 2331
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 3331
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 },       // 4331
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 },      // 1431
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 },       // 2431
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 },       // 3431
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 },       // 4431
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 },    // 1141
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 2141
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 3141
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 4141
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 1241
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 2241
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 3241
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 4241
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 1341
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 2341
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 3341
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 },       // 4341
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 },       // 1441
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 },       // 2441
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 },       // 3441
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 },       // 4441
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 },  // 1112
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 2112
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 3112
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 4112
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 1212
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 2212
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 3212
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 4212
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 1312
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 2312
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 3312
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 },       // 4312
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 },     // 1412
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 },      // 2412
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 },       // 3412
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 },       // 4412
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 },   // 1122
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 2122
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 3122
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 4122
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 1222
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 2222
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 3222
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 4222
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 1322
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 2322
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 3322
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 },       // 4322
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 },      // 1422
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 },       // 2422
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 },       // 3422
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 },       // 4422
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 },    // 1132
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 2132
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 3132
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 4132
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 1232
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 2232
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 3232
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 4232
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 1332
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 2332
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 3332
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 },       // 4332
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 },       // 1432
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 },       // 2432
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 },       // 3432
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 },       // 4432
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 },     // 1142
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 2142
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 3142
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 4142
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 1242
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 2242
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 3242
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 4242
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 1342
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 2342
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 3342
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 },       // 4342
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 },       // 1442
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 },       // 2442
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 },       // 3442
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 },       // 4442
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 },   // 1113
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 2113
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 3113
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 4113
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 1213
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 2213
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 3213
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 4213
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 1313
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 2313
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 3313
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 },       // 4313
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 },      // 1413
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 },       // 2413
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 },       // 3413
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 },       // 4413
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 },    // 1123
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 2123
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 3123
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 4123
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 1223
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 2223
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 3223
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 4223
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 1323
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 2323
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 3323
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 },       // 4323
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 },       // 1423
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 },       // 2423
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 },       // 3423
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 },       // 4423
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 },     // 1133
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 2133
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 3133
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 4133
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 1233
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 2233
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 3233
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 4233
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 1333
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 2333
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 3333
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 },       // 4333
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 },       // 1433
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 },       // 2433
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 },       // 3433
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 },       // 4433
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 },      // 1143
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 2143
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 3143
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 4143
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 1243
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 2243
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 3243
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 4243
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 1343
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 2343
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 3343
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 },       // 4343
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 },       // 1443
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 },       // 2443
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 },       // 3443
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 },       // 4443
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 },    // 1114
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 2114
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 3114
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 4114
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 1214
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 2214
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 3214
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 4214
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 1314
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 2314
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 3314
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 },       // 4314
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 },       // 1414
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 },       // 2414
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 },       // 3414
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 },       // 4414
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 },     // 1124
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 2124
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 3124
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 4124
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 1224
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 2224
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 3224
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 4224
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 1324
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 2324
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 3324
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 },       // 4324
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 },       // 1424
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 },       // 2424
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 },       // 3424
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 },       // 4424
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 },      // 1134
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 2134
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 3134
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 4134
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 1234
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 2234
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 3234
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 4234
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 1334
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 2334
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 3334
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 },       // 4334
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 },       // 1434
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 },       // 2434
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 },       // 3434
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 },       // 4434
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 },       // 1144
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 2144
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 3144
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 4144
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 1244
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 2244
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 3244
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 4244
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 1344
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 2344
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 3344
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 },       // 4344
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },       // 1444
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },       // 2444
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },       // 3444
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }        // 4444
+};
+
+// Appends val to the data stream using the fewest whole bytes (1 to 4) that
+// can hold it, least-significant byte first, and advances *dataPtrPtr past
+// the bytes written.
+//
+// Returns the 2-bit length code for the key stream: 0, 1, 2 or 3 for a
+// 1-, 2-, 3- or 4-byte value respectively.
+//
+// Bytes are stored one at a time rather than through casted uint16_t /
+// uint32_t pointers: the data stream has no alignment guarantee, so wide
+// stores are undefined behaviour on strict-alignment targets and would put
+// the bytes in the wrong order on big-endian hosts.
+static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t *dataPtr = *dataPtrPtr;
+	uint8_t code;
+
+	if (val < (1 << 8)) { // 1 byte
+		dataPtr[0] = (uint8_t) val;
+		code = 0;
+	} else if (val < (1 << 16)) { // 2 bytes
+		dataPtr[0] = (uint8_t) val;
+		dataPtr[1] = (uint8_t) (val >> 8);
+		code = 1;
+	} else if (val < (1 << 24)) { // 3 bytes
+		dataPtr[0] = (uint8_t) val;
+		dataPtr[1] = (uint8_t) (val >> 8);
+		dataPtr[2] = (uint8_t) (val >> 16);
+		code = 2;
+	} else { // 4 bytes
+		dataPtr[0] = (uint8_t) val;
+		dataPtr[1] = (uint8_t) (val >> 8);
+		dataPtr[2] = (uint8_t) (val >> 16);
+		dataPtr[3] = (uint8_t) (val >> 24);
+		code = 3;
+	}
+
+	// A value with length code c occupies c + 1 bytes.
+	*dataPtrPtr = dataPtr + code + 1;
+	return code;
+}
+
+// Scalar encoder: writes count values from in as variable-length data at
+// dataPtr and packs their 2-bit length codes, four per byte with the first
+// value in the lowest bits, at keyPtr.
+// Returns the first unused data byte; when count is 0 nothing is written and
+// dataPtr is returned as is.
+static uint8_t *svb_encode_scalar(const uint32_t *in,
+		uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
+		uint32_t count) {
+	if (count == 0)
+		return dataPtr;
+
+	uint8_t pending = 0; // key byte currently being filled
+	uint32_t idx = 0;
+	while (idx < count) {
+		// Bit position of this value's code inside the key byte: 0, 2, 4, 6.
+		const uint8_t slot = (uint8_t) ((idx & 3) * 2);
+		if (slot == 0 && idx != 0) {
+			// The previous key byte is complete: flush it and start anew.
+			*keyPtr++ = pending;
+			pending = 0;
+		}
+		pending |= (uint8_t) (_encode_data(in[idx], &dataPtr) << slot);
+		idx++;
+	}
+
+	// The last (possibly partially filled) key byte is always written.
+	*keyPtr = pending;
+	return dataPtr;
+}
+
+// Encodes count 32-bit integers read from in into out in streamvbyte format:
+// one key byte per four values (rounded up), each holding four 2-bit length
+// codes, followed by the variable-length data bytes.
+// out must be large enough for the key bytes plus up to 4 data bytes per
+// value. Returns the number of bytes written (0 when count is 0).
+size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
+	uint8_t *keyPtr = out;
+	// Key bytes needed, rounded up. Written without forming count + 3, which
+	// wraps around for counts within 3 of UINT32_MAX.
+	uint32_t keyLen = count / 4 + ((count & 3) != 0);
+	uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
+	return (size_t) (svb_encode_scalar(in, keyPtr, dataPtr, count) - out);
+}
+
+// Decodes the four values described by one key byte with a single byte
+// shuffle.
+//
+// Sixteen bytes are loaded (unaligned) from the current data position and
+// rearranged by the shuffleTable[key] mask, and *dataPtrPtr is then advanced
+// by lengthTable[key] bytes. Returns the shuffled vector.
+//
+// NOTE: the load always touches 16 bytes regardless of how many are
+// consumed, so the input must remain readable that far past the current
+// data position.
+//
+// Both loads use _mm_loadu_si128 with const-correct pointers; in particular
+// the mask fetch does not rely on shuffleTable being 16-byte aligned, which
+// a plain dereference through an __m128i pointer would require.
+static inline __m128i _decode_avx(uint32_t key,
+		const uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t len = lengthTable[key];
+	__m128i Data = _mm_loadu_si128((const __m128i *) *dataPtrPtr);
+	__m128i Shuf = _mm_loadu_si128((const __m128i *) &shuffleTable[key]);
+
+	Data = _mm_shuffle_epi8(Data, Shuf);
+	*dataPtrPtr += len;
+	return Data;
+}
+
+// Stores the four 32-bit lanes of Vec to out[0..3]; out needs no particular
+// alignment because an unaligned store is used.
+static inline void _write_avx(uint32_t *out, __m128i Vec) {
+	__m128i *const dst = (__m128i *) out;
+	_mm_storeu_si128(dst, Vec);
+}
+
+// Reads one value from the data stream and advances *dataPtrPtr past it.
+//
+// code is the 2-bit length code taken from the key stream: 0, 1 or 2 select
+// a 1-, 2- or 3-byte value, and any other code is read as 4 bytes. Bytes are
+// stored least-significant first.
+//
+// The value is assembled byte by byte instead of through casted uint16_t /
+// uint32_t pointers: the data stream has no alignment guarantee, so wide
+// loads are undefined behaviour on strict-alignment targets and would read
+// the bytes in the wrong order on big-endian hosts.
+static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
+	const uint8_t *dataPtr = *dataPtrPtr;
+	uint32_t val;
+
+	if (code == 0) { // 1 byte
+		val = dataPtr[0];
+		dataPtr += 1;
+	} else if (code == 1) { // 2 bytes
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8);
+		dataPtr += 2;
+	} else if (code == 2) { // 3 bytes
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8)
+				| ((uint32_t) dataPtr[2] << 16);
+		dataPtr += 3;
+	} else { // code == 3, 4 bytes
+		// Cast before shifting so the top byte never overflows a signed int.
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8)
+				| ((uint32_t) dataPtr[2] << 16)
+				| ((uint32_t) dataPtr[3] << 24);
+		dataPtr += 4;
+	}
+
+	*dataPtrPtr = dataPtr;
+	return val;
+}
+// Scalar decoder: reads count values whose 2-bit length codes are packed four
+// per byte at keyPtr (first value in the lowest bits) and whose data bytes
+// start at dataPtr, writing the decoded integers to outPtr.
+// Returns the data pointer advanced past the last byte consumed. When count
+// is 0 the loop never runs, so nothing is read or written and dataPtr is
+// returned unchanged.
+static const uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr,
+		const uint8_t *dataPtr, uint32_t count) {
+	uint32_t packed = 0; // key byte currently being consumed
+
+	for (uint32_t idx = 0; idx < count; idx++) {
+		// Bit position of this value's code inside the key byte: 0, 2, 4, 6.
+		const uint32_t slot = (idx & 3) * 2;
+		if (slot == 0)
+			packed = *keyPtr++; // fetch a fresh key byte every fourth value
+		outPtr[idx] = _decode_data(&dataPtr, (uint8_t) ((packed >> slot) & 0x3));
+	}
+
+	return dataPtr;
+}
+
+// Decodes count values into out, reading 2-bit length codes from keyPtr and
+// variable-length data from dataPtr.
+//
+// Key bytes are consumed eight at a time through a 64-bit load; each key byte
+// yields four values via _decode_avx, so every 64-bit group produces 32
+// outputs. The values not covered by whole groups (count & 31) are handed to
+// svb_decode_scalar. Returns the pointer svb_decode_scalar returns, i.e. the
+// data position after the last value decoded.
+//
+// NOTE(review): key bytes are read through a casted const uint64_t pointer
+// and the low byte of each load is used as the first key, which presumably
+// assumes a little-endian target that tolerates unaligned 64-bit loads.
+// NOTE(review): _decode_avx loads 16 data bytes per key even when fewer are
+// consumed, so when count is a multiple of 32 the final loads can reach past
+// the encoded data; callers presumably need readable padding - confirm.
+const uint8_t *svb_decode_avx_simple(uint32_t *out,
+		const uint8_t *__restrict__ keyPtr, const uint8_t *__restrict__ dataPtr,
+		uint64_t count) {
+
+	uint64_t keybytes = count / 4; // number of complete key bytes
+	__m128i Data;
+	// The vector path needs at least one full group of 8 key bytes.
+	if (keybytes >= 8) {
+
+		// Offset runs from -(groups - 1) up to 0, where groups = keybytes / 8.
+		int64_t Offset = -(int64_t) keybytes / 8 + 1;
+
+		// Bias the base pointer so that keyPtr64[Offset] is the first group
+		// and keyPtr64[0] is the last full group.
+		const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
+		uint64_t nextkeys = keyPtr64[Offset];
+		// Every group except the last: decode the current 8 keys while the
+		// following group has already been fetched into nextkeys.
+		for (; Offset != 0; ++Offset) {
+			uint64_t keys = nextkeys;
+			nextkeys = keyPtr64[Offset + 1];
+
+			// Key bytes 0 and 1 of the group.
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 4, Data);
+
+			// Key bytes 2 and 3.
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 8, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 12, Data);
+
+			// Key bytes 4 and 5.
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 16, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 20, Data);
+
+			// Key bytes 6 and 7.
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 24, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 28, Data);
+
+			out += 32;
+		}
+		// Last full group: same decode sequence, but no further key load so
+		// the key stream is not read beyond this group here.
+		{
+			uint64_t keys = nextkeys;
+
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 4, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 8, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 12, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 16, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 20, Data);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0xFF), &dataPtr);
+			_write_avx(out + 24, Data);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			_write_avx(out + 28, Data);
+
+			out += 32;
+		}
+	}
+	// Key bytes handled above: keybytes rounded down to a multiple of 8
+	// (0 when the vector path was skipped).
+	uint64_t consumedkeys = keybytes - (keybytes & 7);
+	// Finish the remaining count & 31 values with the scalar decoder.
+	return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31);
+}
+
+// Reads count 32-bit integers in streamvbyte format from in, storing the
+// result in out. The input starts with one key byte per four values (rounded
+// up), followed by the variable-length data bytes.
+// Returns the number of bytes read from in (0 when count is 0).
+size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t count) {
+	if (count == 0)
+		return 0;
+	const uint8_t *keyPtr = in;            // full list of keys comes first
+	// Key bytes, rounded up. Written without forming count + 3, which wraps
+	// around for counts within 3 of UINT32_MAX.
+	uint32_t keyLen = count / 4 + ((count & 3) != 0);
+	const uint8_t *dataPtr = keyPtr + keyLen;  // data starts at end of keys
+	return (size_t) (svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in);
+}
--- a/cpp/streamvbyte/src/streamvbytedelta.c
+++ b/cpp/streamvbyte/src/streamvbytedelta.c
@@ -0,0 +1,575 @@
+#include "streamvbyte.h"
+#if defined(_MSC_VER)
+     /* Microsoft C/C++-compatible compiler */
+     #include <intrin.h>
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+     /* GCC-compatible compiler, targeting x86/x86-64 */
+     #include <x86intrin.h>
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+     /* GCC-compatible compiler, targeting ARM with NEON */
+     #include <arm_neon.h>
+#elif defined(__GNUC__) && defined(__IWMMXT__)
+     /* GCC-compatible compiler, targeting ARM with WMMX */
+     #include <mmintrin.h>
+#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
+     /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
+     #include <altivec.h>
+#elif defined(__GNUC__) && defined(__SPE__)
+     /* GCC-compatible compiler, targeting PowerPC with SPE */
+     #include <spe.h>
+#endif
+
+// Lookup table indexed by a key byte. Reading the index as four 2-bit fields,
+// each entry equals the sum of (field + 1) over the four fields, so values
+// range from 4 (index 0) to 16 (index 255).
+// NOTE(review): presumably the number of data bytes occupied by the four
+// values a key byte describes - confirm against the decoders that use it.
+static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
+		10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
+		9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
+		11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
+		8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
+		12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
+		11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
+		13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
+		11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
+		9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
+		10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
+		13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
+		13, 14, 15, 16 };
+
+static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
+		-1, -1, 3, -1, -1, -1 }, // 1111
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 2111
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 3111
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 4111
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 },  // 1211
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 2211
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 3211
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 4211
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 },   // 1311
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 },    // 2311
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 },     // 3311
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 },      // 4311
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 },    // 1411
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 },     // 2411
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 },      // 3411
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 },       // 4411
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 },  // 1121
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 2121
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 3121
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 4121
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 },   // 1221
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 2221
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 3221
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 4221
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 },    // 1321
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 },     // 2321
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 },      // 3321
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 },       // 4321
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 },     // 1421
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 },      // 2421
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 },       // 3421
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 },       // 4421
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 },   // 1131
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 2131
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 3131
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 4131
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 },    // 1231
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 2231
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 3231
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 4231
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 },     // 1331
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 },      // 2331
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 },       // 3331
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 },       // 4331
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 },      // 1431
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 },       // 2431
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 },       // 3431
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 },       // 4431
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 },    // 1141
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 2141
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 3141
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 4141
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 },     // 1241
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 2241
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 3241
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 4241
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 },      // 1341
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 },       // 2341
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 },       // 3341
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 },       // 4341
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 },       // 1441
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 },       // 2441
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 },       // 3441
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 },       // 4441
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 },  // 1112
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 2112
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 3112
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 4112
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 },   // 1212
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 2212
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 3212
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 4212
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 },    // 1312
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 },     // 2312
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 },      // 3312
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 },       // 4312
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 },     // 1412
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 },      // 2412
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 },       // 3412
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 },       // 4412
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 },   // 1122
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 2122
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 3122
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 4122
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 },    // 1222
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 2222
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 3222
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 4222
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 },     // 1322
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 },      // 2322
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 },       // 3322
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 },       // 4322
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 },      // 1422
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 },       // 2422
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 },       // 3422
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 },       // 4422
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 },    // 1132
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 2132
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 3132
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 4132
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 },     // 1232
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 2232
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 3232
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 4232
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 },      // 1332
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 },       // 2332
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 },       // 3332
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 },       // 4332
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 },       // 1432
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 },       // 2432
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 },       // 3432
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 },       // 4432
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 },     // 1142
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 2142
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 3142
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 4142
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 },      // 1242
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 2242
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 3242
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 4242
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 },       // 1342
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 },       // 2342
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 },       // 3342
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 },       // 4342
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 },       // 1442
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 },       // 2442
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 },       // 3442
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 },       // 4442
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 },   // 1113
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 2113
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 3113
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 4113
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 },    // 1213
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 2213
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 3213
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 4213
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 },     // 1313
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 },      // 2313
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 },       // 3313
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 },       // 4313
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 },      // 1413
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 },       // 2413
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 },       // 3413
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 },       // 4413
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 },    // 1123
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 2123
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 3123
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 4123
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 },     // 1223
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 2223
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 3223
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 4223
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 },      // 1323
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 },       // 2323
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 },       // 3323
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 },       // 4323
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 },       // 1423
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 },       // 2423
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 },       // 3423
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 },       // 4423
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 },     // 1133
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 2133
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 3133
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 4133
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 },      // 1233
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 2233
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 3233
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 4233
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 },       // 1333
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 },       // 2333
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 },       // 3333
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 },       // 4333
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 },       // 1433
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 },       // 2433
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 },       // 3433
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 },       // 4433
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 },      // 1143
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 2143
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 3143
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 4143
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 },       // 1243
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 2243
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 3243
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 4243
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 },       // 1343
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 },       // 2343
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 },       // 3343
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 },       // 4343
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 },       // 1443
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 },       // 2443
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 },       // 3443
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 },       // 4443
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 },    // 1114
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 2114
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 3114
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 4114
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 },     // 1214
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 2214
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 3214
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 4214
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 },      // 1314
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 },       // 2314
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 },       // 3314
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 },       // 4314
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 },       // 1414
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 },       // 2414
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 },       // 3414
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 },       // 4414
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 },     // 1124
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 2124
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 3124
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 4124
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 },      // 1224
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 2224
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 3224
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 4224
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 },       // 1324
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 },       // 2324
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 },       // 3324
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 },       // 4324
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 },       // 1424
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 },       // 2424
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 },       // 3424
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 },       // 4424
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 },      // 1134
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 2134
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 3134
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 4134
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 },       // 1234
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 2234
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 3234
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 4234
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 },       // 1334
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 },       // 2334
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 },       // 3334
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 },       // 4334
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 },       // 1434
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 },       // 2434
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 },       // 3434
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 },       // 4434
+		{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 },       // 1144
+		{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 2144
+		{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 3144
+		{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 4144
+		{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 },       // 1244
+		{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 2244
+		{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 3244
+		{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 4244
+		{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 },       // 1344
+		{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 },       // 2344
+		{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 },       // 3344
+		{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 },       // 4344
+		{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },       // 1444
+		{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },       // 2444
+		{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },       // 3444
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }        // 4444
+};
+
+/*
+ * Writes val at *dataPtrPtr using the fewest bytes that can hold it (1 to 4),
+ * least-significant byte first, and advances *dataPtrPtr past those bytes.
+ *
+ * Returns the 2-bit length code that goes into the key stream:
+ * number of bytes written minus one (0..3).
+ *
+ * The bytes are stored one at a time instead of through a cast to
+ * uint16_t * / uint32_t *: the data stream has no alignment guarantee, so a
+ * wide store through a cast pointer is undefined behaviour. The byte sequence
+ * is the one the wide stores produced on a little-endian target, so the
+ * encoded format is unchanged.
+ */
+static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t *dataPtr = *dataPtrPtr;
+	uint8_t code;
+
+	if (val < (1 << 8)) { // 1 byte
+		dataPtr[0] = (uint8_t) (val);
+		code = 0;
+	} else if (val < (1 << 16)) { // 2 bytes
+		dataPtr[0] = (uint8_t) (val);
+		dataPtr[1] = (uint8_t) (val >> 8);
+		code = 1;
+	} else if (val < (1 << 24)) { // 3 bytes
+		dataPtr[0] = (uint8_t) (val);
+		dataPtr[1] = (uint8_t) (val >> 8);
+		dataPtr[2] = (uint8_t) (val >> 16);
+		code = 2;
+	} else { // 4 bytes
+		dataPtr[0] = (uint8_t) (val);
+		dataPtr[1] = (uint8_t) (val >> 8);
+		dataPtr[2] = (uint8_t) (val >> 16);
+		dataPtr[3] = (uint8_t) (val >> 24);
+		code = 3;
+	}
+
+	// code + 1 is exactly the number of bytes stored above.
+	*dataPtrPtr = dataPtr + code + 1;
+	return code;
+}
+
+/*
+ * Scalar delta encoder. For each of the `count` values in `in` it encodes the
+ * difference to the previous value (`prev` for the first one) with
+ * _encode_data, appending the bytes at dataPtr and packing the returned 2-bit
+ * codes four to a byte at keyPtr (first value in the lowest two bits).
+ *
+ * Returns a pointer to the first unused data byte. Nothing is written when
+ * count is 0.
+ */
+static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in,
+		uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
+		uint32_t count, uint32_t prev) {
+	if (count == 0)
+		return dataPtr;
+
+	const uint32_t *const end = in + count;
+	uint8_t pending = 0; // key byte currently being assembled
+	uint8_t bitpos = 0;  // next free bit position in pending: 0, 2, 4, 6
+
+	for (const uint32_t *cur = in; cur != end; ++cur) {
+		if (bitpos == 8) {
+			// Four codes collected: flush the key byte and start a new one.
+			*keyPtr++ = pending;
+			pending = 0;
+			bitpos = 0;
+		}
+		const uint32_t delta = *cur - prev;
+		prev = *cur;
+		pending |= _encode_data(delta, &dataPtr) << bitpos;
+		bitpos += 2;
+	}
+
+	// The last key byte may be only partly filled; keyPtr is not advanced.
+	*keyPtr = pending;
+	return dataPtr;
+}
+
+/*
+ * Delta-encodes `count` integers from `in` into `out`; `prev` is the value the
+ * first integer is differenced against.
+ *
+ * Layout of `out`: (count + 3) / 4 key bytes starting at out[0], followed
+ * directly by the variable-length data bytes.
+ *
+ * Returns the total number of bytes written (keys plus data).
+ */
+size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out,
+		uint32_t prev) {
+	const uint32_t keyBytes = (count + 3) / 4; // 2 bits per value, rounded up
+	uint8_t *const keys = out;
+	uint8_t *const data = keys + keyBytes;
+	uint8_t *const end = svb_encode_scalar_d1_init(in, keys, data, count, prev);
+
+	return end - out;
+}
+
+// Decodes the four values described by one key byte into the four 32-bit
+// lanes of the returned vector and advances *dataPtrPtr by lengthTable[key].
+//
+// A full 16 bytes are loaded from *dataPtrPtr whatever the key says; the mask
+// looked up in shuffleTable then moves each value's bytes into its lane. Mask
+// entries of -1 have their top bit set, which makes _mm_shuffle_epi8 write a
+// zero byte, so values shorter than 4 bytes come out zero-extended.
+//
+// NOTE(review): lengthTable is defined outside this view; presumably it holds
+// the total number of data bytes used by the four values of `key` -- confirm.
+// NOTE(review): because the load is always 16 bytes wide, decoding close to the
+// end of a buffer can read bytes beyond the encoded data; verify that callers
+// leave enough readable slack.
+static inline __m128i _decode_avx(uint32_t key, const uint8_t *__restrict__ *dataPtrPtr) {
+	uint8_t len = lengthTable[key];
+	__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr);
+	__m128i Shuf = *(__m128i *) &shuffleTable[key];
+
+	Data = _mm_shuffle_epi8(Data, Shuf);
+	*dataPtrPtr += len;
+
+	return Data;
+}
+#define BroadcastLastXMM 0xFF // _mm_shuffle_epi32 control: all four 2-bit selectors are 3, so every lane receives the highest element
+
+
+
+// Stores the four 32-bit lanes of Vec at out. The store is the unaligned
+// variant, so out needs no particular alignment.
+static inline void _write_avx(uint32_t *out, __m128i Vec) {
+	_mm_storeu_si128((__m128i *) out, Vec);
+}
+
+// Turns four deltas into four absolute values and stores them at out.
+//
+// Vec holds the deltas [A B C D]; the highest lane of Prev holds P, the last
+// value decoded before this group. The shift-and-add steps below build the
+// running sums, so the stored lanes are [P+A, P+A+B, P+A+B+C, P+A+B+C+D]
+// (written "PA PAB PABC PABCD" in the lane traces).
+//
+// Returns the stored vector; its highest lane is the P for the next group.
+static __m128i _write_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
+	__m128i Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done)
+	Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P]
+	Vec = _mm_add_epi32(Vec, Add);                    // Cycle 2: [A AB BC CD]
+	Add = _mm_slli_si128(Vec, 8);                     // Cycle 3: [- - A AB]
+	Vec = _mm_add_epi32(Vec, Prev);                 // Cycle 3: [PA PAB PBC PCD]
+	Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD]
+
+	_write_avx(out, Vec);
+	return Vec;
+}
+
+// Byte-shuffle mask for _mm_shuffle_epi8 that takes the upper four 16-bit
+// lanes of a vector (bytes 8..15) and zero-extends each into a 32-bit lane:
+// output bytes are 8, 9, 0, 0, 10, 11, 0, 0, 12, 13, 0, 0, 14, 15, 0, 0.
+// Both initialisers describe the same 16 bytes; the first spells them as two
+// little-endian 64-bit words, the second lists the bytes individually.
+#ifndef _MSC_VER
+static __m128i High16To32 = {0xFFFF0B0AFFFF0908, 0xFFFF0F0EFFFF0D0C};
+#else
+static __m128i High16To32 = {8,  9,  -1, -1, 10, 11, -1, -1,
+                           12, 13, -1, -1, 14, 15, -1, -1};
+#endif
+
+// Delta-decodes eight values at once and stores them at out[0..7].
+//
+// Vec holds eight deltas in 16-bit lanes; the highest 32-bit lane of Prev holds
+// P, the last value decoded before this group. Returns the vector stored at
+// out + 4, whose highest lane is the P for the next group.
+//
+// The two prefix-sum additions use 32-bit adds on 16-bit lanes. That is only
+// correct while no 16-bit lane overflows into its neighbour; the callers in
+// this file pass deltas zero-extended from single bytes, so a lane holds at
+// most a sum of four byte values at that point.
+static inline __m128i _write_16bit_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
+  // vec == [A B C D E F G H] (16 bit values)
+  __m128i Add = _mm_slli_si128(Vec, 2);               // [- A B C D E F G]
+  Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit)
+  Vec = _mm_add_epi32(Vec, Add);                    // [A AB BC CD DE EF FG GH]
+  Add = _mm_slli_si128(Vec, 4);                     // [- - A AB BC CD DE EF]
+  Vec = _mm_add_epi32(Vec, Add);      // [A AB ABC ABCD BCDE CDEF DEFG EFGH]
+  __m128i V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit)
+  V1 = _mm_add_epi32(V1, Prev);       // [PA PAB PABC PABCD] (32-bit)
+  __m128i V2 =
+      _mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit)
+  V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit)
+  _write_avx(out, V1);
+  _write_avx(out + 4, V2);
+  return V2;
+}
+
+/*
+ * Reads one value of code + 1 bytes (least-significant byte first) from
+ * *dataPtrPtr, advances *dataPtrPtr past it and returns the value.
+ *
+ * code is the 2-bit length code from the key stream; any code other than
+ * 0, 1 or 2 is treated as the 4-byte case.
+ *
+ * The value is assembled byte by byte instead of being loaded through a cast
+ * to uint16_t * / uint32_t *: the data stream has no alignment guarantee, so
+ * a wide load through a cast pointer is undefined behaviour. The result is
+ * the one the wide loads gave on a little-endian target.
+ */
+static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
+	const uint8_t *dataPtr = *dataPtrPtr;
+	uint32_t val;
+
+	if (code == 0) { // 1 byte
+		val = (uint32_t) dataPtr[0];
+		dataPtr += 1;
+	} else if (code == 1) { // 2 bytes
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8);
+		dataPtr += 2;
+	} else if (code == 2) { // 3 bytes
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8)
+				| ((uint32_t) dataPtr[2] << 16);
+		dataPtr += 3;
+	} else { // code == 3, 4 bytes
+		// The casts keep the << 24 in unsigned arithmetic.
+		val = (uint32_t) dataPtr[0] | ((uint32_t) dataPtr[1] << 8)
+				| ((uint32_t) dataPtr[2] << 16) | ((uint32_t) dataPtr[3] << 24);
+		dataPtr += 4;
+	}
+
+	*dataPtrPtr = dataPtr;
+	return val;
+}
+
+/*
+ * Scalar delta decoder. Reads `count` length codes from the key stream at
+ * keyPtr (four 2-bit codes per byte, first value in the lowest two bits) and
+ * the matching deltas from dataPtr, and writes the running sums, seeded with
+ * `prev`, to outPtr.
+ *
+ * Returns a pointer to the first data byte that was not consumed. Nothing is
+ * read or written when count is 0.
+ */
+const uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr,
+		const uint8_t *dataPtr, uint32_t count, uint32_t prev) {
+	if (count == 0)
+		return dataPtr;
+
+	uint32_t keyBits = 0;
+	for (uint32_t i = 0; i < count; i++) {
+		if ((i & 3) == 0)
+			keyBits = *keyPtr++; // a fresh key byte every four values
+		prev += _decode_data(&dataPtr, keyBits & 0x3);
+		keyBits >>= 2; // bring the next value's code into the low bits
+		*outPtr++ = prev;
+	}
+
+	return dataPtr;
+}
+
+// SIMD delta decoder: decodes `count` values from the key stream at keyPtr and
+// the data stream at dataPtr into out, adding each delta to the running value
+// that starts at `prev`. Returns a pointer to the first unused data byte.
+//
+// Values are handled in groups of 32 (eight key bytes, read as one uint64_t)
+// as long as at least eight complete key bytes exist; whatever is left
+// (count & 31 values) goes through svb_decode_scalar_d1_init.
+//
+// NOTE(review): the key bytes are read through a uint64_t pointer without any
+// alignment check -- confirm this is acceptable on every target.
+// NOTE(review): _decode_avx always loads 16 data bytes, so the general path of
+// the last group can read past the end of the encoded data; verify callers
+// provide readable slack after the buffer.
+const uint8_t *svb_decode_avx_d1_init(uint32_t *out, const uint8_t *__restrict__ keyPtr,
+		const uint8_t *__restrict__ dataPtr, uint64_t count, uint32_t prev) {
+	uint64_t keybytes = count / 4; // number of key bytes
+	if (keybytes >= 8) {
+		__m128i Prev = _mm_set1_epi32(prev);
+		__m128i Data;
+
+		// Offset counts groups from 1 - keybytes / 8 up to 0 (the unary minus
+		// applies before the division, which truncates toward zero).
+		int64_t Offset = -(int64_t) keybytes / 8 + 1;
+
+		// Bias the pointer so that keyPtr64[Offset] is the first group and
+		// keyPtr64[0] is the last complete group.
+		const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
+		uint64_t nextkeys = keyPtr64[Offset];
+		// Every group except the last; the keys of the following group are
+		// fetched one iteration ahead.
+		for (; Offset != 0; ++Offset) {
+			uint64_t keys = nextkeys;
+			nextkeys = keyPtr64[Offset + 1];
+			// faster 16-bit delta since we only have 8-bit values
+			if (!keys) { // 32 1-byte ints in a row
+
+				// Each load fetches 16 bytes but only the low 8 are widened,
+				// hence the stride of 8.
+				Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
+				Prev = _write_16bit_avx_d1(out, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
+				Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
+				Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 24)));
+				Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
+				out += 32;
+				dataPtr += 32;
+				continue;
+			}
+
+			// General path: one key byte (four values) per _decode_avx call,
+			// which also advances dataPtr.
+			Data = _decode_avx(keys & 0x00FF, &dataPtr);
+			Prev = _write_avx_d1(out, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 4, Data, Prev);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0x00FF), &dataPtr);
+			Prev = _write_avx_d1(out + 8, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 12, Data, Prev);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0x00FF), &dataPtr);
+			Prev = _write_avx_d1(out + 16, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 20, Data, Prev);
+
+			keys >>= 16;
+			Data = _decode_avx((keys & 0x00FF), &dataPtr);
+			Prev = _write_avx_d1(out + 24, Data, Prev);
+			Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+			Prev = _write_avx_d1(out + 28, Data, Prev);
+
+			out += 32;
+		}
+		// Last complete group: same work as the loop body, but no further key
+		// word is fetched.
+		{
+			uint64_t keys = nextkeys;
+			// faster 16-bit delta since we only have 8-bit values
+			if (!keys) { // 32 1-byte ints in a row
+				Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
+				Prev = _write_16bit_avx_d1(out, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
+				Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
+				Data = _mm_cvtepu8_epi16(
+						_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
+				Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
+				// 8-byte load here, so the read stops at the last of the 32
+				// data bytes of this group.
+				Data = _mm_cvtepu8_epi16(
+						_mm_loadl_epi64((__m128i *) (dataPtr + 24)));
+				Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
+				out += 32;
+				dataPtr += 32;
+
+			} else {
+
+				Data = _decode_avx(keys & 0x00FF, &dataPtr);
+				Prev = _write_avx_d1(out, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 4, Data, Prev);
+
+				keys >>= 16;
+				Data = _decode_avx((keys & 0x00FF), &dataPtr);
+				Prev = _write_avx_d1(out + 8, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 12, Data, Prev);
+
+				keys >>= 16;
+				Data = _decode_avx((keys & 0x00FF), &dataPtr);
+				Prev = _write_avx_d1(out + 16, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 20, Data, Prev);
+
+				keys >>= 16;
+				Data = _decode_avx((keys & 0x00FF), &dataPtr);
+				Prev = _write_avx_d1(out + 24, Data, Prev);
+				Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
+				Prev = _write_avx_d1(out + 28, Data, Prev);
+
+				out += 32;
+			}
+		}
+		// The scalar tail continues from the last value the SIMD path wrote.
+		prev = out[-1];
+	}
+	// Key bytes handled above: keybytes rounded down to a multiple of 8
+	// (0 when the SIMD path was skipped).
+	uint64_t consumedkeys = keybytes - (keybytes & 7);
+	return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr,
+			count & 31, prev);
+}
+
+/*
+ * Decodes `count` delta-coded integers from `in` into `out`; `prev` is the
+ * value the first delta is added to.
+ *
+ * `in` is expected to start with (count + 3) / 4 key bytes, followed directly
+ * by the data bytes.
+ *
+ * Returns the number of input bytes consumed (keys plus data), measured from
+ * `in` to the position returned by svb_decode_avx_d1_init.
+ */
+size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out,
+		uint32_t count, uint32_t prev) {
+	const uint32_t keyBytes = (count + 3) / 4; // 2 bits per value, rounded up
+	const uint8_t *const data = in + keyBytes;
+	const uint8_t *const end = svb_decode_avx_d1_init(out, in, data, count, prev);
+
+	return end - in;
+}
--- a/cpp/streamvbyte/tests/unit.c
+++ b/cpp/streamvbyte/tests/unit.c
@@ -0,0 +1,73 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "streamvbyte.h"
+#include "streamvbytedelta.h"
+
+/*
+ * Round-trip test for the plain and the delta codec.
+ *
+ * For every length 0..128 and then 256, 512, ... 4096 it checks that
+ *   - an array filled with a constant `gap` (powers of 3 up to 387420489)
+ *     survives streamvbyte_encode / streamvbyte_decode, and
+ *   - the sequence gap * k (gap a power of 3 up to 531441) survives
+ *     streamvbyte_delta_encode / streamvbyte_delta_decode,
+ * and that the decoder reports consuming exactly the bytes the encoder
+ * produced.
+ *
+ * Returns 0 on success and -1 on the first mismatch or if the buffers cannot
+ * be allocated. The buffers are released on every path.
+ */
+int main() {
+	int N = 4096;
+	int status = -1; // becomes 0 only once every round trip has passed
+	uint32_t * datain = malloc(N * sizeof(uint32_t));
+	uint8_t * compressedbuffer = malloc(2 * N * sizeof(uint32_t));
+	uint32_t * recovdata = malloc(N * sizeof(uint32_t));
+
+	// free(NULL) is a no-op, so a partial allocation failure is safe to
+	// route through the common cleanup.
+	if (datain == NULL || compressedbuffer == NULL || recovdata == NULL) {
+		printf("could not allocate the test buffers\n");
+		goto cleanup;
+	}
+
+	for (int length = 0; length <= N;) {
+		printf("length = %d \n", length);
+		for (uint32_t gap = 1; gap <= 387420489; gap *= 3) {
+			for (int k = 0; k < length; ++k)
+				datain[k] = gap;
+			size_t compsize = streamvbyte_encode(datain, length,
+					compressedbuffer);
+			size_t usedbytes = streamvbyte_decode(compressedbuffer, recovdata,
+					length);
+			if (compsize != usedbytes) {
+				printf(
+						"[streamvbyte_decode] code is buggy gap = %d, size mismatch %d %d \n",
+						(int) gap, (int) compsize, (int) usedbytes);
+				goto cleanup;
+			}
+			for (int k = 0; k < length; ++k) {
+				if (recovdata[k] != datain[k]) {
+					printf("[streamvbyte_decode] code is buggy gap = %d\n",
+							(int) gap);
+					goto cleanup;
+				}
+			}
+		}
+
+		printf("Delta \n");
+		for (size_t gap = 1; gap <= 531441; gap *= 3) {
+			for (int k = 0; k < length; ++k)
+				datain[k] = gap * k;
+			size_t compsize = streamvbyte_delta_encode(datain, length,
+					compressedbuffer, 0);
+			size_t usedbytes = streamvbyte_delta_decode(compressedbuffer,
+					recovdata, length, 0);
+			if (compsize != usedbytes) {
+				printf(
+						"[streamvbyte_delta_decode] code is buggy gap = %d, size mismatch %d %d \n",
+						(int) gap, (int) compsize, (int) usedbytes);
+				goto cleanup;
+			}
+			for (int k = 0; k < length; ++k) {
+				if (recovdata[k] != datain[k]) {
+					printf(
+							"[streamvbyte_delta_decode] code is buggy gap = %d\n",
+							(int) gap);
+					goto cleanup;
+				}
+			}
+
+		}
+
+		// Every length up to 128, then only powers of two.
+		if (length < 128)
+			++length;
+		else {
+			length *= 2;
+		}
+	}
+	status = 0;
+	printf("Code looks good.\n");
+
+cleanup:
+	free(datain);
+	free(compressedbuffer);
+	free(recovdata);
+	return status;
+}
--- a/examples/simple_search.rs
+++ b/examples/simple_search.rs
@@ -20,7 +20,10 @@ fn main() {
    }
 }

+
 fn run_example(index_path: &Path) -> tantivy::Result<()> {
+
+
    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
@@ -28,6 +31,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // and for each field, its type and "the way it should
    // be indexed".

+
    // first we need to define a schema ...
    let mut schema_builder = SchemaBuilder::default();

@@ -58,6 +62,8 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {

    let schema = schema_builder.build();

+
+
    // # Indexing documents
    //
    // Let's create a brand new index.
@@ -66,6 +72,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // with our schema in the directory.
    let index = Index::create(index_path, schema.clone())?;

+
    // To insert document we need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
@@ -78,6 +85,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // Let's index our documents!
    // We first need a handle on the title and the body field.

+
    // ### Create a document "manually".
    //
    // We can create a document manually, by setting the fields
@@ -90,7 +98,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
    old_man_doc.add_text(
        body,
        "He was an old man who fished alone in a skiff in the Gulf Stream and \
-         he had gone eighty-four days now without taking a fish.",
+                          he had gone eighty-four days now without taking a fish.",
    );

    // ... and add it to the `IndexWriter`.
@@ -137,6 +145,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // Indexing 5 million articles of the English wikipedia takes
    // around 4 minutes on my computer!

+
    // ### Committing
    //
    // At this point our documents are not searchable.
@@ -158,6 +167,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // tantivy behaves as if has rolled back to its last
    // commit.

+
    // # Searching
    //
    // Let's search our index. Start by reloading
@@ -182,6 +192,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
    // A ticket has been opened regarding this problem.
    let query = query_parser.parse_query("sea whale")?;

+
    // A query defines a set of documents, as
    // well as the way they should be scored.
    //
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1 +0,0 @@
-use_try_shorthand = true
--- a/script/build-doc.sh
+++ b/script/build-doc.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Renders every Markdown page under docs/ to HTML with rustdoc and copies the
+# stylesheets from docs/ next to the generated pages.
+DEST=target/doc/tantivy/docs/
+mkdir -p "$DEST"
+
+# Iterate over the glob itself: looping over the output of `ls` splits file
+# names on whitespace.
+for f in docs/*.md
+do
+    # When nothing matches, the pattern is left unexpanded; skip it.
+    [ -e "$f" ] || continue
+    rustdoc "$f" -o "$DEST" --markdown-css ../../rustdoc.css --markdown-css style.css
+done
+
+cp docs/*.css "$DEST"
--- a/script/profile.sh
+++ b/script/profile.sh
@@ -0,0 +1,5 @@
+#/bin/bash
+valgrind --tool=cachegrind target/release/tantivy-bench -i /data/wiki-index -q ./queries.txt -n 3
+valgrind --tool=callgrind target/release/tantivy-bench -i /data/wiki-index -q ./queries.txt -n 3
+
+
--- a/src/collector/chained_collector.rs
+++ b/src/collector/chained_collector.rs
@@ -16,10 +16,6 @@ impl Collector for DoNothingCollector {
    }
    #[inline]
    fn collect(&mut self, _doc: DocId, _score: Score) {}
-    #[inline]
-    fn requires_scoring(&self) -> bool {
-        false
-    }
 }

 /// Zero-cost abstraction used to collect on multiple collectors.
@@ -46,8 +42,8 @@ impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Rig
        segment_local_id: SegmentLocalId,
        segment: &SegmentReader,
    ) -> Result<()> {
-        self.left.set_segment(segment_local_id, segment)?;
-        self.right.set_segment(segment_local_id, segment)?;
+        try!(self.left.set_segment(segment_local_id, segment));
+        try!(self.right.set_segment(segment_local_id, segment));
        Ok(())
    }

@@ -55,10 +51,6 @@ impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Rig
        self.left.collect(doc, score);
        self.right.collect(doc, score);
    }
-
-    fn requires_scoring(&self) -> bool {
-        self.left.requires_scoring() || self.right.requires_scoring()
-    }
 }

 /// Creates a `ChainedCollector`
--- a/src/collector/count_collector.rs
+++ b/src/collector/count_collector.rs
@@ -7,7 +7,6 @@ use SegmentLocalId;

 /// `CountCollector` collector only counts how many
 /// documents match the query.
-#[derive(Default)]
 pub struct CountCollector {
    count: usize,
 }
@@ -20,6 +19,12 @@ impl CountCollector {
    }
 }

+impl Default for CountCollector {
+    /// Builds a collector that has not counted any document yet.
+    fn default() -> Self {
+        Self { count: 0 }
+    }
+}
+
 impl Collector for CountCollector {
    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
        Ok(())
@@ -28,27 +33,23 @@ impl Collector for CountCollector {
    fn collect(&mut self, _: DocId, _: Score) {
        self.count += 1;
    }
-
-    fn requires_scoring(&self) -> bool {
-        false
-    }
 }

 #[cfg(test)]
 mod tests {

-    use collector::{Collector, CountCollector};
+    use super::*;
+    use test::Bencher;
+    use collector::Collector;

-    #[test]
-    fn test_count_collector() {
-        let mut count_collector = CountCollector::default();
-        assert_eq!(count_collector.count(), 0);
-        count_collector.collect(0u32, 1f32);
-        assert_eq!(count_collector.count(), 1);
-        assert_eq!(count_collector.count(), 1);
-        count_collector.collect(1u32, 1f32);
-        assert_eq!(count_collector.count(), 2);
-        assert!(!count_collector.requires_scoring());
+    // Benchmark: each iteration builds a fresh `CountCollector`, feeds it
+    // 1_000_000 doc ids with a constant score of 1.0 and hands the final
+    // count back to the bencher.
+    #[bench]
+    fn build_collector(b: &mut Bencher) {
+        b.iter(|| {
+            let mut count_collector = CountCollector::default();
+            for doc in 0..1_000_000 {
+                count_collector.collect(doc, 1f32);
+            }
+            count_collector.count()
+        });
    }
-
 }
--- a/src/collector/facet_collector.rs
+++ b/src/collector/facet_collector.rs
@@ -1,637 +1,113 @@
-use std::mem;
+use std::cmp::Eq;
+use std::collections::HashMap;
+use std::hash::Hash;
+
 use collector::Collector;
-use fastfield::FacetReader;
+use fastfield::FastFieldReader;
 use schema::Field;
-use std::cell::UnsafeCell;
-use schema::Facet;
-use std::collections::BTreeMap;
-use std::collections::BinaryHeap;
-use std::collections::Bound;
-use termdict::TermDictionary;
-use termdict::TermStreamer;
-use termdict::TermStreamerBuilder;
-use std::collections::BTreeSet;
-use termdict::TermMerger;
-use docset::SkipResult;
-use std::{usize, u64};
-use std::iter::Peekable;

 use DocId;
 use Result;
 use Score;
 use SegmentReader;
 use SegmentLocalId;
-use std::cmp::Ordering;

-struct Hit<'a> {
-    count: u64,
-    facet: &'a Facet,
-}
-
-impl<'a> Eq for Hit<'a> {}
-
-impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
-    fn eq(&self, other: &Hit) -> bool {
-        self.count == other.count
-    }
-}
-
-impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
-    fn partial_cmp(&self, other: &Hit) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<'a> Ord for Hit<'a> {
-    fn cmp(&self, other: &Self) -> Ordering {
-        other.count.cmp(&self.count)
-    }
-}
-
-struct SegmentFacetCounter {
-    pub facet_reader: FacetReader,
-    pub facet_ords: Vec<u64>,
-    pub facet_counts: Vec<u64>,
-}
-
-fn facet_depth(facet_bytes: &[u8]) -> usize {
-    if facet_bytes.is_empty() {
-        0
-    } else {
-        facet_bytes.iter().cloned().filter(|b| *b == 0u8).count() + 1
-    }
-}
-
-/// Collector for faceting
-///
-/// The collector collects all facets. You need to configure it
-/// beforehand with the facet you want to extract.
-///
-/// This is done by calling `.add_facet(...)` with the root of the
-/// facet you want to extract as argument.
-///
-/// Facet counts will only be computed for the facet that are direct children
-/// of such a root facet.
-///
-/// For instance, if your index represents books, your hierarchy of facets
-/// may contain `category`, `language`.
-///
-/// The category facet may include `subcategories`. For instance, a book
-/// could belong to `/category/fiction/fantasy`.
-///
-/// If you request the facet counts for `/category`, the result will be
-/// the breakdown of counts for the direct children of `/category`
-/// (e.g. `/category/fiction`, `/category/biography`, `/category/personal_development`).
-///
-/// Once collection is finished, you can harvest its results in the form
-/// of a `FacetCounts` object, and extract your face                t counts from it.
-///
-/// This implementation assumes you are working with a number of facets that
-/// is much hundreds of time lower than your number of documents.
-///
-///
-/// ```rust
-/// #[macro_use]
-/// extern crate tantivy;
-/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
-/// use tantivy::{Index, Result};
-/// use tantivy::collector::FacetCollector;
-/// use tantivy::query::AllQuery;
-///
-/// # fn main() { example().unwrap(); }
-/// fn example() -> Result<()> {
-///     let mut schema_builder = SchemaBuilder::new();
-///
-///     // Facet have their own specific type.
-///     // It is not a bad practise to put all of your
-///     // facet information in the same field.
-///     let facet = schema_builder.add_facet_field("facet");
-///     let title = schema_builder.add_text_field("title", TEXT);
-///     let schema = schema_builder.build();
-///     let index = Index::create_in_ram(schema);
-///     {
-///         let mut index_writer = index.writer(3_000_000)?;
-///         // a document can be associated to any number of facets
-///         index_writer.add_document(doc!(
-///             title => "The Name of the Wind",
-///             facet => Facet::from("/lang/en"),
-///             facet => Facet::from("/category/fiction/fantasy")
-///         ));
-///         index_writer.add_document(doc!(
-///             title => "Dune",
-///             facet => Facet::from("/lang/en"),
-///             facet => Facet::from("/category/fiction/sci-fi")
-///         ));
-///         index_writer.add_document(doc!(
-///             title => "La Vénus d'Ille",
-///             facet => Facet::from("/lang/fr"),
-///             facet => Facet::from("/category/fiction/fantasy"),
-///             facet => Facet::from("/category/fiction/horror")
-///         ));
-///         index_writer.add_document(doc!(
-///             title => "The Diary of a Young Girl",
-///             facet => Facet::from("/lang/en"),
-///             facet => Facet::from("/category/biography")
-///         ));
-///         index_writer.commit().unwrap();
-///     }
-///
-///     index.load_searchers()?;
-///     let searcher = index.searcher();
-///
-///     {
-///			let mut facet_collector = FacetCollector::for_field(facet);
-///         facet_collector.add_facet("/lang");
-///         facet_collector.add_facet("/category");
-///         searcher.search(&AllQuery, &mut facet_collector).unwrap();
-///
-///         // this object contains count aggregate for all of the facets.
-///         let counts = facet_collector.harvest();
-///
-///         // This lists all of the facet counts
-///         let facets: Vec<(&Facet, u64)> = counts
-///             .get("/category")
-///             .collect();
-///         assert_eq!(facets, vec![
-///             (&Facet::from("/category/biography"), 1),
-///             (&Facet::from("/category/fiction"), 3)
-///         ]);
-///     }
-///
-///     {
-///			let mut facet_collector = FacetCollector::for_field(facet);
-///         facet_collector.add_facet("/category/fiction");
-///         searcher.search(&AllQuery, &mut facet_collector).unwrap();
-///
-///         // this object contains count aggregate for all of the facets.
-///         let counts = facet_collector.harvest();
-///
-///         // This lists all of the facet counts
-///         let facets: Vec<(&Facet, u64)> = counts
-///             .get("/category/fiction")
-///             .collect();
-///         assert_eq!(facets, vec![
-///             (&Facet::from("/category/fiction/fantasy"), 2),
-///             (&Facet::from("/category/fiction/horror"), 1),
-///             (&Facet::from("/category/fiction/sci-fi"), 1)
-///         ]);
-///     }
-///
-///    {
-///			let mut facet_collector = FacetCollector::for_field(facet);
-///         facet_collector.add_facet("/category/fiction");
-///         searcher.search(&AllQuery, &mut facet_collector).unwrap();
-///
-///         // this object contains count aggregate for all of the facets.
-///         let counts = facet_collector.harvest();
-///
-///         // This lists all of the facet counts
-///         let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
-///         assert_eq!(facets, vec![
-///             (&Facet::from("/category/fiction/fantasy"), 2)
-///         ]);
-///     }
-///
-///     Ok(())
-/// }
-/// ```
-pub struct FacetCollector {
-    facet_ords: Vec<u64>,
+/// Facet collector  for i64/u64 fast field
+pub struct FacetCollector<T>
+where
+    T: FastFieldReader,
+    T::ValueType: Eq + Hash,
+{
+    counters: HashMap<T::ValueType, u64>,
    field: Field,
-    ff_reader: Option<UnsafeCell<FacetReader>>,
-    segment_counters: Vec<SegmentFacetCounter>,
-
-    // facet_ord -> collapse facet_id
-    current_segment_collapse_mapping: Vec<usize>,
-    // collapse facet_id -> count
-    current_segment_counts: Vec<u64>,
-    // collapse facet_id -> facet_ord
-    current_collapse_facet_ords: Vec<u64>,
-
-    facets: BTreeSet<Facet>,
+    ff_reader: Option<T>,
 }

-fn skip<'a, I: Iterator<Item = &'a Facet>>(
-    target: &[u8],
-    collapse_it: &mut Peekable<I>,
-) -> SkipResult {
-    loop {
-        match collapse_it.peek() {
-            Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
-                Ordering::Less => {}
-                Ordering::Greater => {
-                    return SkipResult::OverStep;
-                }
-                Ordering::Equal => {
-                    return SkipResult::Reached;
-                }
-            },
-            None => {
-                return SkipResult::End;
-            }
-        }
-        collapse_it.next();
-    }
-}
-
-impl FacetCollector {
-    /// Create a facet collector to collect the facets
-    /// from a specific facet `Field`.
-    ///
-    /// This function does not check whether the field
-    /// is of the proper type.
-    pub fn for_field(field: Field) -> FacetCollector {
+impl<T> FacetCollector<T>
+where
+    T: FastFieldReader,
+    T::ValueType: Eq + Hash,
+{
+    /// Creates a new facet collector for aggregating a given field.
+    pub fn new(field: Field) -> FacetCollector<T> {
        FacetCollector {
-            facet_ords: Vec::with_capacity(255),
-            segment_counters: Vec::new(),
-            field,
+            counters: HashMap::new(),
+            field: field,
            ff_reader: None,
-            facets: BTreeSet::new(),
-
-            current_segment_collapse_mapping: Vec::new(),
-            current_collapse_facet_ords: Vec::new(),
-            current_segment_counts: Vec::new(),
        }
    }
-
-    /// Adds a facet that we want to record counts
-    ///
-    /// Adding facet `Facet::from("/country")` for instance,
-    /// will record the counts of all of the direct children of the facet country
-    /// (e.g. `/country/FR`, `/country/UK`).
-    ///
-    /// Adding two facets within which one is the prefix of the other is forbidden.
-    /// If you need the correct number of unique documents for two such facets,
-    /// just add them in separate `FacetCollector`.
-    pub fn add_facet<T>(&mut self, facet_from: T)
-    where
-        Facet: From<T>,
-    {
-        let facet = Facet::from(facet_from);
-        for old_facet in &self.facets {
-            assert!(
-                !old_facet.is_prefix_of(&facet),
-                "Tried to add a facet which is a descendant of an already added facet."
-            );
-            assert!(
-                !facet.is_prefix_of(old_facet),
-                "Tried to add a facet which is an ancestor of an already added facet."
-            );
-        }
-        self.facets.insert(facet);
-    }
-
-    fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
-        self.current_segment_collapse_mapping.clear();
-        self.current_collapse_facet_ords.clear();
-        self.current_segment_counts.clear();
-        let mut collapse_facet_it = self.facets.iter().peekable();
-        self.current_collapse_facet_ords.push(0);
-        let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
-        if !facet_streamer.advance() {
-            return;
-        }
-        'outer: loop {
-            // at the begining of this loop, facet_streamer
-            // is positionned on a term that has not been processed yet.
-            let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
-            match skip_result {
-                SkipResult::Reached => {
-                    // we reach a facet we decided to collapse.
-                    let collapse_depth = facet_depth(facet_streamer.key());
-                    let mut collapsed_id = 0;
-                    self.current_segment_collapse_mapping.push(0);
-                    while facet_streamer.advance() {
-                        let depth = facet_depth(facet_streamer.key());
-                        if depth <= collapse_depth {
-                            continue 'outer;
-                        }
-                        if depth == collapse_depth + 1 {
-                            collapsed_id = self.current_collapse_facet_ords.len();
-                            self.current_collapse_facet_ords
-                                .push(facet_streamer.term_ord());
-                            self.current_segment_collapse_mapping.push(collapsed_id);
-                        } else {
-                            self.current_segment_collapse_mapping.push(collapsed_id);
-                        }
-                    }
-                    break;
-                }
-                SkipResult::End | SkipResult::OverStep => {
-                    self.current_segment_collapse_mapping.push(0);
-                    if !facet_streamer.advance() {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-
-    fn finalize_segment(&mut self) {
-        if self.ff_reader.is_some() {
-            self.segment_counters.push(SegmentFacetCounter {
-                facet_reader: self.ff_reader.take().unwrap().into_inner(),
-                facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
-                facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
-            });
-        }
-    }
-
-    /// Returns the results of the collection.
-    ///
-    /// This method does not just return the counters,
-    /// it also translates the facet ordinals of the last segment.
-    pub fn harvest(mut self) -> FacetCounts {
-        self.finalize_segment();
-
-        let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
-            .iter()
-            .map(|segment_counter| &segment_counter.facet_ords[..])
-            .collect();
-        let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
-            .iter()
-            .map(|segment_counter| &segment_counter.facet_counts[..])
-            .collect();
-
-        let facet_streams = self.segment_counters
-            .iter()
-            .map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
-            .collect::<Vec<_>>();
-
-        let mut facet_merger = TermMerger::new(facet_streams);
-        let mut facet_counts = BTreeMap::new();
-
-        while facet_merger.advance() {
-            let count = facet_merger
-                .current_kvs()
-                .iter()
-                .map(|it| {
-                    let seg_ord = it.segment_ord;
-                    let term_ord = it.streamer.term_ord();
-                    collapsed_facet_ords[seg_ord]
-                        .binary_search(&term_ord)
-                        .map(|collapsed_term_id| {
-                            if collapsed_term_id == 0 {
-                                0
-                            } else {
-                                collapsed_facet_counts[seg_ord][collapsed_term_id]
-                            }
-                        })
-                        .unwrap_or(0)
-                })
-                .sum();
-            if count > 0u64 {
-                let bytes = facet_merger.key().to_owned();
-                facet_counts.insert(Facet::from_encoded(bytes), count);
-            }
-        }
-        FacetCounts { facet_counts }
-    }
 }

-impl Collector for FacetCollector {
+impl<T> Collector for FacetCollector<T>
+where
+    T: FastFieldReader,
+    T::ValueType: Eq + Hash,
+{
    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
-        self.finalize_segment();
-        let facet_reader = reader.facet_reader(self.field)?;
-        self.set_collapse_mapping(&facet_reader);
-        self.current_segment_counts
-            .resize(self.current_collapse_facet_ords.len(), 0);
-        self.ff_reader = Some(UnsafeCell::new(facet_reader));
+        self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
        Ok(())
    }

    fn collect(&mut self, doc: DocId, _: Score) {
-        let facet_reader: &mut FacetReader = unsafe {
-            &mut *self.ff_reader
-                .as_ref()
-                .expect("collect() was called before set_segment. This should never happen.")
-                .get()
-        };
-        facet_reader.facet_ords(doc, &mut self.facet_ords);
-        let mut previous_collapsed_ord: usize = usize::MAX;
-        for &facet_ord in &self.facet_ords {
-            let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
-            self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
-            {
-                0
-            } else {
-                1
-            };
-            previous_collapsed_ord = collapsed_ord;
-        }
-    }
-
-    fn requires_scoring(&self) -> bool {
-        false
-    }
-}
-
-/// Intermediary result of the `FacetCollector` that stores
-/// the facet counts for all the segments.
-pub struct FacetCounts {
-    facet_counts: BTreeMap<Facet, u64>,
-}
-
-impl FacetCounts {
-    #[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
-    pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
-    where
-        Facet: From<T>,
-    {
-        let facet = Facet::from(facet_from);
-        let left_bound = Bound::Excluded(facet.clone());
-        let right_bound = if facet.is_root() {
-            Bound::Unbounded
-        } else {
-            let mut facet_after_bytes = facet.encoded_bytes().to_owned();
-            facet_after_bytes.push(1u8);
-            let facet_after = Facet::from_encoded(facet_after_bytes);
-            Bound::Excluded(facet_after)
-        };
-
-        self.facet_counts
-            .range((left_bound, right_bound))
-            .map(|(facet, count)| (facet, *count))
-    }
-
-    pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
-    where
-        Facet: From<T>,
-    {
-        let mut heap = BinaryHeap::with_capacity(k);
-        let mut it = self.get(facet);
-
-        for (facet, count) in (&mut it).take(k) {
-            heap.push(Hit { count, facet });
-        }
-
-        let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN);
-        for (facet, count) in it {
-            if count > lowest_count {
-                lowest_count = count;
-                if let Some(mut head) = heap.peek_mut() {
-                    *head = Hit { count, facet };
-                }
-            }
-        }
-        heap.into_sorted_vec()
-            .into_iter()
-            .map(|hit| (hit.facet, hit.count))
-            .collect::<Vec<_>>()
+        let val = self.ff_reader
+            .as_ref()
+            .expect("collect() was called before set_segment. This should never happen.")
+            .get(doc);
+        *(self.counters.entry(val).or_insert(0)) += 1;
    }
 }

 #[cfg(test)]
 mod tests {
-    use test::Bencher;
-    use core::Index;
-    use schema::{Document, Facet, SchemaBuilder};
-    use query::AllQuery;
-    use super::{FacetCollector, FacetCounts};
-    use std::iter;
-    use schema::Field;
-    use rand::{thread_rng, Rng};
+
+    use collector::{chain, FacetCollector};
+    use query::QueryParser;
+    use fastfield::{I64FastFieldReader, U64FastFieldReader};
+    use schema::{self, FAST, STRING};
+    use Index;

    #[test]
-    fn test_facet_collector_drilldown() {
-        let mut schema_builder = SchemaBuilder::new();
-        let facet_field = schema_builder.add_facet_field("facet");
+    // create 10 documents: num_u64 is i % 2 (0/1 for even/odd docs), num_i64 is i % 3;
+    // make sure the facet counters are filled correctly
+    fn test_facet_collector_results() {
+        let mut schema_builder = schema::SchemaBuilder::new();
+        let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
+        let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
+        let text_field = schema_builder.add_text_field("text", STRING);
        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);

-        let mut index_writer = index.writer(3_000_000).unwrap();
-        let num_facets: usize = 3 * 4 * 5;
-        let facets: Vec<Facet> = (0..num_facets)
-            .map(|mut n| {
-                let top = n % 3;
-                n /= 3;
-                let mid = n % 4;
-                n /= 4;
-                let leaf = n % 5;
-                Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
-            })
-            .collect();
-        for i in 0..num_facets * 10 {
-            let mut doc = Document::new();
-            doc.add_facet(facet_field, facets[i % num_facets].clone());
-            index_writer.add_document(doc);
-        }
-        index_writer.commit().unwrap();
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
+        let index = Index::create_in_ram(schema.clone());

-        let mut facet_collector = FacetCollector::for_field(facet_field);
-        facet_collector.add_facet(Facet::from("/top1"));
-        searcher.search(&AllQuery, &mut facet_collector).unwrap();
-
-        let counts: FacetCounts = facet_collector.harvest();
        {
-            let facets: Vec<(String, u64)> = counts
-                .get("/top1")
-                .map(|(facet, count)| (facet.to_string(), count))
-                .collect();
-            assert_eq!(
-                facets,
-                [
-                    ("/top1/mid0", 50),
-                    ("/top1/mid1", 50),
-                    ("/top1/mid2", 50),
-                    ("/top1/mid3", 50),
-                ].iter()
-                    .map(|&(facet_str, count)| (String::from(facet_str), count))
-                    .collect::<Vec<_>>()
-            );
-        }
-    }
-
-    #[test]
-    #[should_panic(expected = "Tried to add a facet which is a descendant of \
-                               an already added facet.")]
-    fn test_misused_facet_collector() {
-        let mut facet_collector = FacetCollector::for_field(Field(0));
-        facet_collector.add_facet(Facet::from("/country"));
-        facet_collector.add_facet(Facet::from("/country/europe"));
-    }
-
-    #[test]
-    fn test_non_used_facet_collector() {
-        let mut facet_collector = FacetCollector::for_field(Field(0));
-        facet_collector.add_facet(Facet::from("/country"));
-        facet_collector.add_facet(Facet::from("/countryeurope"));
-    }
-
-    #[test]
-    fn test_facet_collector_topk() {
-        let mut schema_builder = SchemaBuilder::new();
-        let facet_field = schema_builder.add_facet_field("facet");
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-
-        let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
-            .into_iter()
-            .flat_map(|(c, count)| {
-                let facet = Facet::from(&format!("/facet_{}", c));
-                let doc = doc!(facet_field => facet);
-                iter::repeat(doc).take(count)
-            })
-            .collect();
-        thread_rng().shuffle(&mut docs[..]);
-
-        let mut index_writer = index.writer(3_000_000).unwrap();
-        for doc in docs {
-            index_writer.add_document(doc);
-        }
-        index_writer.commit().unwrap();
-        index.load_searchers().unwrap();
-
-        let searcher = index.searcher();
-
-        let mut facet_collector = FacetCollector::for_field(facet_field);
-        facet_collector.add_facet("/");
-        searcher.search(&AllQuery, &mut facet_collector).unwrap();
-
-        let counts: FacetCounts = facet_collector.harvest();
-        {
-            let facets: Vec<(&Facet, u64)> = counts.top_k("/", 3);
-            assert_eq!(
-                facets,
-                vec![
-                    (&Facet::from("/facet_b"), 100),
-                    (&Facet::from("/facet_e"), 21),
-                    (&Facet::from("/facet_d"), 12),
-                ]
-            );
-        }
-    }
-
-    #[bench]
-    fn bench_facet_collector(b: &mut Bencher) {
-        let mut schema_builder = SchemaBuilder::new();
-        let facet_field = schema_builder.add_facet_field("facet");
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-
-        let mut docs = vec![];
-        for val in 0..50 {
-            let facet = Facet::from(&format!("/facet_{}", val));
-            for _ in 0..val * val {
-                docs.push(doc!(facet_field=>facet.clone()));
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            {
+                for i in 0u64..10u64 {
+                    index_writer.add_document(doc!(
+                        num_field_i64 => ((i as i64) % 3i64) as i64,
+                        num_field_u64 => (i % 2u64) as u64,
+                        text_field => "text"
+                    ));
+                }
            }
+            assert_eq!(index_writer.commit().unwrap(), 10u64);
        }
-        // 40425 docs
-        thread_rng().shuffle(&mut docs[..]);

-        let mut index_writer = index.writer(3_000_000).unwrap();
-        for doc in docs {
-            index_writer.add_document(doc);
-        }
-        index_writer.commit().unwrap();
        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        let mut ffvf_i64: FacetCollector<I64FastFieldReader> = FacetCollector::new(num_field_i64);
+        let mut ffvf_u64: FacetCollector<U64FastFieldReader> = FacetCollector::new(num_field_u64);

-        b.iter(|| {
-            let searcher = index.searcher();
-            let mut facet_collector = FacetCollector::for_field(facet_field);
-            searcher.search(&AllQuery, &mut facet_collector).unwrap();
-        });
+        {
+            // perform the query
+            let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
+            let query_parser = QueryParser::for_index(&index, vec![text_field]);
+            let query = query_parser.parse_query("text:text").unwrap();
+            query.search(&searcher, &mut facet_collectors).unwrap();
+        }
+
+        assert_eq!(ffvf_u64.counters[&0], 5);
+        assert_eq!(ffvf_u64.counters[&1], 5);
+        assert_eq!(ffvf_i64.counters[&0], 4);
+        assert_eq!(ffvf_i64.counters[&1], 3);
    }
 }
--- a/src/collector/int_facet_collector.rs
+++ b/src/collector/int_facet_collector.rs
@@ -1,123 +0,0 @@
-use std::cmp::Eq;
-use std::collections::HashMap;
-use std::hash::Hash;
-
-use collector::Collector;
-use fastfield::FastFieldReader;
-use schema::Field;
-
-use DocId;
-use Result;
-use Score;
-use SegmentReader;
-use SegmentLocalId;
-
-
-/// Facet collector  for i64/u64 fast field
-pub struct IntFacetCollector<T>
-where
-    T: FastFieldReader,
-    T::ValueType: Eq + Hash,
-{
-    counters: HashMap<T::ValueType, u64>,
-    field: Field,
-    ff_reader: Option<T>,
-}
-
-
-impl<T> IntFacetCollector<T>
-where
-    T: FastFieldReader,
-    T::ValueType: Eq + Hash,
-{
-    /// Creates a new facet collector for aggregating a given field.
-    pub fn new(field: Field) -> IntFacetCollector<T> {
-        IntFacetCollector {
-            counters: HashMap::new(),
-            field: field,
-            ff_reader: None,
-        }
-    }
-}
-
-
-impl<T> Collector for IntFacetCollector<T>
-where
-    T: FastFieldReader,
-    T::ValueType: Eq + Hash,
-{
-    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
-        self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
-        Ok(())
-    }
-
-    fn collect(&mut self, doc: DocId, _: Score) {
-        let val = self.ff_reader
-            .as_ref()
-            .expect(
-                "collect() was called before set_segment. \
-                This should never happen.",
-            )
-            .get(doc);
-        *(self.counters.entry(val).or_insert(0)) += 1;
-    }
-}
-
-
-
-#[cfg(test)]
-mod tests {
-
-    use collector::{chain, IntFacetCollector};
-    use query::QueryParser;
-    use fastfield::{I64FastFieldReader, U64FastFieldReader};
-    use schema::{self, FAST, STRING};
-    use Index;
-
-    #[test]
-    // create 10 documents, set num field value to 0 or 1 for even/odd ones
-    // make sure we have facet counters correctly filled
-    fn test_facet_collector_results() {
-
-        let mut schema_builder = schema::SchemaBuilder::new();
-        let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
-        let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
-        let text_field = schema_builder.add_text_field("text", STRING);
-        let schema = schema_builder.build();
-
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            {
-                for i in 0u64..10u64 {
-                    index_writer.add_document(doc!(
-                        num_field_i64 => ((i as i64) % 3i64) as i64,
-                        num_field_u64 => (i % 2u64) as u64,
-                        text_field => "text"
-                    ));
-                }
-            }
-            assert_eq!(index_writer.commit().unwrap(), 10u64);
-        }
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
-        let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
-
-        {
-            // perform the query
-            let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
-            let mut query_parser = QueryParser::for_index(index, vec![text_field]);
-            let query = query_parser.parse_query("text:text").unwrap();
-            query.search(&searcher, &mut facet_collectors).unwrap();
-        }
-
-        assert_eq!(ffvf_u64.counters[&0], 5);
-        assert_eq!(ffvf_u64.counters[&1], 5);
-        assert_eq!(ffvf_i64.counters[&0], 4);
-        assert_eq!(ffvf_i64.counters[&1], 3);
-
-    }
-}
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -62,9 +62,6 @@ pub trait Collector {
    ) -> Result<()>;
    /// The query pushes the scored document to the collector via this method.
    fn collect(&mut self, doc: DocId, score: Score);
-
-    /// Returns true iff the collector requires to compute scores for documents.
-    fn requires_scoring(&self) -> bool;
 }

 impl<'a, C: Collector> Collector for &'a mut C {
@@ -77,11 +74,7 @@ impl<'a, C: Collector> Collector for &'a mut C {
    }
    /// The query pushes the scored document to the collector via this method.
    fn collect(&mut self, doc: DocId, score: Score) {
-        C::collect(self, doc, score)
-    }
-
-    fn requires_scoring(&self) -> bool {
-        C::requires_scoring(self)
+        (*self).collect(doc, score);
    }
 }

@@ -94,6 +87,7 @@ pub mod tests {
    use Score;
    use core::SegmentReader;
    use SegmentLocalId;
+    use fastfield::U64FastFieldReader;
    use fastfield::FastFieldReader;
    use schema::Field;

@@ -105,7 +99,6 @@ pub mod tests {
        offset: DocId,
        segment_max_doc: DocId,
        docs: Vec<DocId>,
-        scores: Vec<Score>,
    }

    impl TestCollector {
@@ -113,19 +106,14 @@ pub mod tests {
        pub fn docs(self) -> Vec<DocId> {
            self.docs
        }
-
-        pub fn scores(self) -> Vec<Score> {
-            self.scores
-        }
    }

    impl Default for TestCollector {
        fn default() -> TestCollector {
            TestCollector {
+                docs: Vec::new(),
                offset: 0,
                segment_max_doc: 0,
-                docs: Vec::new(),
-                scores: Vec::new(),
            }
        }
    }
@@ -137,13 +125,8 @@ pub mod tests {
            Ok(())
        }

-        fn collect(&mut self, doc: DocId, score: Score) {
+        fn collect(&mut self, doc: DocId, _score: Score) {
            self.docs.push(doc + self.offset);
-            self.scores.push(score);
-        }
-
-        fn requires_scoring(&self) -> bool {
-            true
        }
    }

@@ -154,14 +137,14 @@ pub mod tests {
    pub struct FastFieldTestCollector {
        vals: Vec<u64>,
        field: Field,
-        ff_reader: Option<FastFieldReader<u64>>,
+        ff_reader: Option<U64FastFieldReader>,
    }

    impl FastFieldTestCollector {
        pub fn for_field(field: Field) -> FastFieldTestCollector {
            FastFieldTestCollector {
                vals: Vec::new(),
-                field,
+                field: field,
                ff_reader: None,
            }
        }
@@ -173,7 +156,7 @@ pub mod tests {

    impl Collector for FastFieldTestCollector {
        fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
-            self.ff_reader = Some(reader.fast_field_reader(self.field)?);
+            self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
            Ok(())
        }

@@ -181,9 +164,6 @@ pub mod tests {
            let val = self.ff_reader.as_ref().unwrap().get(doc);
            self.vals.push(val);
        }
-        fn requires_scoring(&self) -> bool {
-            false
-        }
    }

    #[bench]
--- a/src/collector/multi_collector.rs
+++ b/src/collector/multi_collector.rs
@@ -16,7 +16,9 @@ pub struct MultiCollector<'a> {
 impl<'a> MultiCollector<'a> {
    /// Constructor
    pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
-        MultiCollector { collectors }
+        MultiCollector {
+            collectors: collectors,
+        }
    }
 }

@@ -27,7 +29,7 @@ impl<'a> Collector for MultiCollector<'a> {
        segment: &SegmentReader,
    ) -> Result<()> {
        for collector in &mut self.collectors {
-            collector.set_segment(segment_local_id, segment)?;
+            try!(collector.set_segment(segment_local_id, segment));
        }
        Ok(())
    }
@@ -37,11 +39,6 @@ impl<'a> Collector for MultiCollector<'a> {
            collector.collect(doc, score);
        }
    }
-    fn requires_scoring(&self) -> bool {
-        self.collectors
-            .iter()
-            .any(|collector| collector.requires_scoring())
-    }
 }

 #[cfg(test)]
--- a/src/collector/top_collector.rs
+++ b/src/collector/top_collector.rs
@@ -60,7 +60,7 @@ impl TopCollector {
            panic!("Limit must be strictly greater than 0.");
        }
        TopCollector {
-            limit,
+            limit: limit,
            heap: BinaryHeap::with_capacity(limit),
            segment_id: 0,
        }
@@ -119,16 +119,12 @@ impl Collector for TopCollector {
            }
        } else {
            let wrapped_doc = GlobalScoredDoc {
-                score,
+                score: score,
                doc_address: DocAddress(self.segment_id, doc),
            };
            self.heap.push(wrapped_doc);
        }
    }
-
-    fn requires_scoring(&self) -> bool {
-        true
-    }
 }

 #[cfg(test)]
--- a/src/common/bitpacker.rs
+++ b/src/common/bitpacker.rs
@@ -3,37 +3,65 @@ use std::io;
 use common::serialize::BinarySerializable;
 use std::mem;
 use std::ops::Deref;
-use std::ptr;

-pub(crate) struct BitPacker {
+/// Computes the number of bits that will be used for bitpacking.
+///
+/// In general the target is the minimum number of bits
+/// required to express the amplitude given in argument.
+///
+/// e.g. if the amplitude is 10, all values fit in just 4 bits.
+///
+/// The logic is slightly more convoluted here because, for optimization
+/// reasons, we want to ensure that a value spans at most 8
+/// byte-aligned bytes.
+///
+/// Spanning 9 bytes is possible, for instance, if we do
+/// bitpacking with an amplitude of 63 bits.
+/// In this case, the second int starts on bit
+/// 63 (which belongs to byte 7) and ends in byte 15:
+/// hence 9 bytes (from byte 7 to byte 15 included).
+///
+/// To avoid this, we force the number of bits to 64
+/// when the result is greater than `64 - 8 = 56` bits.
+///
+/// Note that this only affects rare use cases spanning
+/// a very large range of values. Even in this case, it results
+/// in an extra cost of at most 12% compared to the optimal
+/// number of bits.
+pub fn compute_num_bits(amplitude: u64) -> u8 {
+    let amplitude = (64u32 - amplitude.leading_zeros()) as u8;
+    if amplitude <= 64 - 8 {
+        amplitude
+    } else {
+        64
+    }
+}
+
+pub struct BitPacker {
    mini_buffer: u64,
    mini_buffer_written: usize,
+    num_bits: usize,
 }

 impl BitPacker {
-    pub fn new() -> BitPacker {
+    pub fn new(num_bits: usize) -> BitPacker {
        BitPacker {
            mini_buffer: 0u64,
            mini_buffer_written: 0,
+            num_bits,
        }
    }

-    pub fn write<TWrite: Write>(
-        &mut self,
-        val: u64,
-        num_bits: u8,
-        output: &mut TWrite,
-    ) -> io::Result<()> {
+    pub fn write<TWrite: Write>(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> {
        let val_u64 = val as u64;
-        let num_bits = num_bits as usize;
-        if self.mini_buffer_written + num_bits > 64 {
+        if self.mini_buffer_written + self.num_bits > 64 {
            self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
            self.mini_buffer.serialize(output)?;
            self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
-            self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
+            self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64;
        } else {
            self.mini_buffer |= val_u64 << self.mini_buffer_written;
-            self.mini_buffer_written += num_bits;
+            self.mini_buffer_written += self.num_bits;
            if self.mini_buffer_written == 64 {
                self.mini_buffer.serialize(output)?;
                self.mini_buffer_written = 0;
@@ -43,7 +71,7 @@ impl BitPacker {
        Ok(())
    }

-    pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+    pub(crate) fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
        if self.mini_buffer_written > 0 {
            let num_bytes = (self.mini_buffer_written + 7) / 8;
            let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
@@ -61,7 +89,6 @@ impl BitPacker {
    }
 }

-#[derive(Clone)]
 pub struct BitUnpacker<Data>
 where
    Data: Deref<Target = [u8]>,
@@ -75,14 +102,14 @@ impl<Data> BitUnpacker<Data>
 where
    Data: Deref<Target = [u8]>,
 {
-    pub fn new(data: Data, num_bits: u8) -> BitUnpacker<Data> {
+    pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
        let mask: u64 = if num_bits == 64 {
            !0u64
        } else {
            (1u64 << num_bits) - 1u64
        };
        BitUnpacker {
-            num_bits: num_bits as usize,
+            num_bits,
            mask,
            data,
        }
@@ -90,7 +117,7 @@ where

    pub fn get(&self, idx: usize) -> u64 {
        if self.num_bits == 0 {
-            return 0u64;
+            return 0;
        }
        let data: &[u8] = &*self.data;
        let num_bits = self.num_bits;
@@ -106,32 +133,29 @@ where
                addr + 8 <= data.len(),
                "The fast field field should have been padded with 7 bytes."
            );
-            let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
+            let val_unshifted_unmasked: u64 =
+                unsafe { *(data[addr..].as_ptr() as *const u64) };
            let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
-            val_shifted & mask
+            (val_shifted & mask)
        } else {
            let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
-                unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
+                unsafe { *(data[addr..].as_ptr() as *const u64) }
            } else {
                let mut buffer = [0u8; 8];
                for i in addr..data.len() {
                    buffer[i - addr] += data[i];
                }
-                unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
+                unsafe { *(buffer[..].as_ptr() as *const u64) }
            };
-            let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
-            val_shifted & mask
+            let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
+            (val_shifted & mask)
        }
    }

-    /// Reads a range of values from the fast field.
-    ///
-    /// The range of values read is from
-    /// `[start..start + output.len()[`
    pub fn get_range(&self, start: u32, output: &mut [u64]) {
        if self.num_bits == 0 {
            for val in output.iter_mut() {
-                *val = 0u64;
+                *val = 0;
            }
        } else {
            let data: &[u8] = &*self.data;
@@ -141,7 +165,8 @@ where
            for output_val in output.iter_mut() {
                let addr = addr_in_bits >> 3;
                let bit_shift = addr_in_bits & 7;
-                let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
+                let val_unshifted_unmasked: u64 =
+                    unsafe { *(data[addr..].as_ptr() as *const u64) };
                let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
                *output_val = val_shifted & mask;
                addr_in_bits += num_bits;
@@ -152,25 +177,37 @@ where

 #[cfg(test)]
 mod test {
-    use super::{BitPacker, BitUnpacker};
+    use super::{compute_num_bits, BitPacker, BitUnpacker};

-    fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
+    #[test]
+    fn test_compute_num_bits() {
+        assert_eq!(compute_num_bits(1), 1u8);
+        assert_eq!(compute_num_bits(0), 0u8);
+        assert_eq!(compute_num_bits(2), 2u8);
+        assert_eq!(compute_num_bits(3), 2u8);
+        assert_eq!(compute_num_bits(4), 3u8);
+        assert_eq!(compute_num_bits(255), 8u8);
+        assert_eq!(compute_num_bits(256), 9u8);
+        assert_eq!(compute_num_bits(5_000_000_000), 33u8);
+    }
+
+    fn create_fastfield_bitpacker(len: usize, num_bits: usize) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
        let mut data = Vec::new();
-        let mut bitpacker = BitPacker::new();
-        let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
+        let mut bitpacker = BitPacker::new(num_bits);
+        let max_val: u64 = (1 << num_bits) - 1;
        let vals: Vec<u64> = (0u64..len as u64)
            .map(|i| if max_val == 0 { 0 } else { i % max_val })
            .collect();
        for &val in &vals {
-            bitpacker.write(val, num_bits, &mut data).unwrap();
+            bitpacker.write(val, &mut data).unwrap();
        }
        bitpacker.close(&mut data).unwrap();
-        assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
+        assert_eq!(data.len(), (num_bits * len + 7) / 8 + 7);
        let bitunpacker = BitUnpacker::new(data, num_bits);
        (bitunpacker, vals)
    }

-    fn test_bitpacker_util(len: usize, num_bits: u8) {
+    fn test_bitpacker_util(len: usize, num_bits: usize) {
        let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
        for (i, val) in vals.iter().enumerate() {
            assert_eq!(bitunpacker.get(i), *val);
--- a/src/common/bitset.rs
+++ b/src/common/bitset.rs
@@ -1,389 +0,0 @@
-use std::fmt;
-use std::u64;
-
-#[derive(Clone, Copy, Eq, PartialEq)]
-pub(crate) struct TinySet(u64);
-
-impl fmt::Debug for TinySet {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        self.into_iter().collect::<Vec<u32>>().fmt(f)
-    }
-}
-
-pub struct TinySetIterator(TinySet);
-impl Iterator for TinySetIterator {
-    type Item = u32;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.0.pop_lowest()
-    }
-}
-
-impl IntoIterator for TinySet {
-    type Item = u32;
-    type IntoIter = TinySetIterator;
-    fn into_iter(self) -> Self::IntoIter {
-        TinySetIterator(self)
-    }
-}
-
-impl TinySet {
-    /// Returns an empty `TinySet`.
-    pub fn empty() -> TinySet {
-        TinySet(0u64)
-    }
-
-    /// Returns the complement of the set in `[0, 64[`.
-    fn complement(&self) -> TinySet {
-        TinySet(!self.0)
-    }
-
-    /// Returns true iff the `TinySet` contains the element `el`.
-    pub fn contains(&self, el: u32) -> bool {
-        !self.intersect(TinySet::singleton(el)).is_empty()
-    }
-
-    /// Returns the intersection of `self` and `other`
-    pub fn intersect(&self, other: TinySet) -> TinySet {
-        TinySet(self.0 & other.0)
-    }
-
-    /// Creates a new `TinySet` containing only one element
-    /// within `[0; 64[`
-    #[inline(always)]
-    pub fn singleton(el: u32) -> TinySet {
-        TinySet(1u64 << u64::from(el))
-    }
-
-    /// Insert a new element within [0..64[
-    #[inline(always)]
-    pub fn insert(self, el: u32) -> TinySet {
-        self.union(TinySet::singleton(el))
-    }
-
-    /// Insert a new element within [0..64[
-    #[inline(always)]
-    pub fn insert_mut(&mut self, el: u32) -> bool {
-        let old = *self;
-        *self = old.insert(el);
-        old != *self
-    }
-
-    /// Returns the union of two tinysets
-    #[inline(always)]
-    pub fn union(self, other: TinySet) -> TinySet {
-        TinySet(self.0 | other.0)
-    }
-
-    /// Returns true iff the `TinySet` is empty.
-    #[inline(always)]
-    pub fn is_empty(&self) -> bool {
-        self.0 == 0u64
-    }
-
-    /// Returns the lowest element in the `TinySet`
-    /// and removes it.
-    #[inline(always)]
-    pub fn pop_lowest(&mut self) -> Option<u32> {
-        if self.is_empty() {
-            None
-        } else {
-            let lowest = self.0.trailing_zeros() as u32;
-            self.0 ^= TinySet::singleton(lowest).0;
-            Some(lowest)
-        }
-    }
-
-    /// Returns a `TinySet` than contains all values up
-    /// to limit excluded.
-    ///
-    /// The limit is assumed to be strictly lower than 64.
-    pub fn range_lower(upper_bound: u32) -> TinySet {
-        TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
-    }
-
-    /// Returns a `TinySet` that contains all values greater
-    /// or equal to the given limit, included. (and up to 63)
-    ///
-    /// The limit is assumed to be strictly lower than 64.
-    pub fn range_greater_or_equal(from_included: u32) -> TinySet {
-        TinySet::range_lower(from_included).complement()
-    }
-
-    pub fn clear(&mut self) {
-        self.0 = 0u64;
-    }
-
-    pub fn len(&self) -> u32 {
-        self.0.count_ones()
-    }
-}
-
-#[derive(Clone)]
-pub struct BitSet {
-    tinysets: Box<[TinySet]>,
-    len: usize, //< Technically it should be u32, but we
-    // count multiple inserts.
-    // `usize` guards us from overflow.
-    max_value: u32,
-}
-
-fn num_buckets(max_val: u32) -> u32 {
-    (max_val + 63u32) / 64u32
-}
-
-impl BitSet {
-    /// Create a new `BitSet` that may contain elements
-    /// within `[0, max_val[`.
-    pub fn with_max_value(max_value: u32) -> BitSet {
-        let num_buckets = num_buckets(max_value);
-        let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
-        BitSet {
-            tinysets: tinybisets,
-            len: 0,
-            max_value,
-        }
-    }
-
-    /// Removes all elements from the `BitSet`.
-    pub fn clear(&mut self) {
-        for tinyset in self.tinysets.iter_mut() {
-            *tinyset = TinySet::empty();
-        }
-    }
-
-    /// Returns the number of elements in the `BitSet`.
-    pub fn len(&self) -> usize {
-        self.len
-    }
-
-    /// Inserts an element in the `BitSet`
-    pub fn insert(&mut self, el: u32) {
-        // we do not check saturated els.
-        let higher = el / 64u32;
-        let lower = el % 64u32;
-        self.len += if self.tinysets[higher as usize].insert_mut(lower) {
-            1
-        } else {
-            0
-        };
-    }
-
-    /// Returns true iff the elements is in the `BitSet`.
-    pub fn contains(&self, el: u32) -> bool {
-        self.tinyset(el / 64u32).contains(el % 64)
-    }
-
-    /// Returns the first non-empty `TinySet` associated to a bucket lower
-    /// or greater than bucket.
-    ///
-    /// Reminder: the tiny set with the bucket `bucket`, represents the
-    /// elements from `bucket * 64` to `(bucket+1) * 64`.
-    pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
-        self.tinysets[bucket as usize..]
-            .iter()
-            .cloned()
-            .position(|tinyset| !tinyset.is_empty())
-            .map(|delta_bucket| bucket + delta_bucket as u32)
-    }
-
-    pub fn max_value(&self) -> u32 {
-        self.max_value
-    }
-
-    /// Returns the tiny bitset representing the
-    /// the set restricted to the number range from
-    /// `bucket * 64` to `(bucket + 1) * 64`.
-    pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
-        self.tinysets[bucket as usize]
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    extern crate test;
-    use tests;
-    use std::collections::HashSet;
-    use super::BitSet;
-    use super::TinySet;
-    use tests::generate_nonunique_unsorted;
-    use std::collections::BTreeSet;
-    use query::BitSetDocSet;
-    use docset::DocSet;
-
-    #[test]
-    fn test_tiny_set() {
-        assert!(TinySet::empty().is_empty());
-        {
-            let mut u = TinySet::empty().insert(1u32);
-            assert_eq!(u.pop_lowest(), Some(1u32));
-            assert!(u.pop_lowest().is_none())
-        }
-        {
-            let mut u = TinySet::empty().insert(1u32).insert(1u32);
-            assert_eq!(u.pop_lowest(), Some(1u32));
-            assert!(u.pop_lowest().is_none())
-        }
-        {
-            let mut u = TinySet::empty().insert(2u32);
-            assert_eq!(u.pop_lowest(), Some(2u32));
-            u.insert_mut(1u32);
-            assert_eq!(u.pop_lowest(), Some(1u32));
-            assert!(u.pop_lowest().is_none());
-        }
-        {
-            let mut u = TinySet::empty().insert(63u32);
-            assert_eq!(u.pop_lowest(), Some(63u32));
-            assert!(u.pop_lowest().is_none());
-        }
-    }
-
-    #[test]
-    fn test_bitset() {
-        let test_against_hashset = |els: &[u32], max_value: u32| {
-            let mut hashset: HashSet<u32> = HashSet::new();
-            let mut bitset = BitSet::with_max_value(max_value);
-            for &el in els {
-                assert!(el < max_value);
-                hashset.insert(el);
-                bitset.insert(el);
-            }
-            for el in 0..max_value {
-                assert_eq!(hashset.contains(&el), bitset.contains(el));
-            }
-            assert_eq!(bitset.max_value(), max_value);
-        };
-
-        test_against_hashset(&[], 0);
-        test_against_hashset(&[], 1);
-        test_against_hashset(&[0u32], 1);
-        test_against_hashset(&[0u32], 100);
-        test_against_hashset(&[1u32, 2u32], 4);
-        test_against_hashset(&[99u32], 100);
-        test_against_hashset(&[63u32], 64);
-        test_against_hashset(&[62u32, 63u32], 64);
-    }
-
-    #[test]
-    fn test_bitset_large() {
-        let arr = generate_nonunique_unsorted(1_000_000, 50_000);
-        let mut btreeset: BTreeSet<u32> = BTreeSet::new();
-        let mut bitset = BitSet::with_max_value(1_000_000);
-        for el in arr {
-            btreeset.insert(el);
-            bitset.insert(el);
-        }
-        for i in 0..1_000_000 {
-            assert_eq!(btreeset.contains(&i), bitset.contains(i));
-        }
-        assert_eq!(btreeset.len(), bitset.len());
-        let mut bitset_docset = BitSetDocSet::from(bitset);
-        for el in btreeset.into_iter() {
-            bitset_docset.advance();
-            assert_eq!(bitset_docset.doc(), el);
-        }
-        assert!(!bitset_docset.advance());
-    }
-
-    #[test]
-    fn test_bitset_num_buckets() {
-        use super::num_buckets;
-        assert_eq!(num_buckets(0u32), 0);
-        assert_eq!(num_buckets(1u32), 1);
-        assert_eq!(num_buckets(64u32), 1);
-        assert_eq!(num_buckets(65u32), 2);
-        assert_eq!(num_buckets(128u32), 2);
-        assert_eq!(num_buckets(129u32), 3);
-    }
-
-    #[test]
-    fn test_tinyset_range() {
-        assert_eq!(
-            TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
-            [0, 1, 2]
-        );
-        assert!(TinySet::range_lower(0).is_empty());
-        assert_eq!(
-            TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
-            (0u32..63u32).collect::<Vec<_>>()
-        );
-        assert_eq!(
-            TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
-            [0]
-        );
-        assert_eq!(
-            TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
-            [0, 1]
-        );
-        assert_eq!(
-            TinySet::range_greater_or_equal(3)
-                .into_iter()
-                .collect::<Vec<u32>>(),
-            (3u32..64u32).collect::<Vec<_>>()
-        );
-    }
-
-    #[test]
-    fn test_bitset_len() {
-        let mut bitset = BitSet::with_max_value(1_000);
-        assert_eq!(bitset.len(), 0);
-        bitset.insert(3u32);
-        assert_eq!(bitset.len(), 1);
-        bitset.insert(103u32);
-        assert_eq!(bitset.len(), 2);
-        bitset.insert(3u32);
-        assert_eq!(bitset.len(), 2);
-        bitset.insert(103u32);
-        assert_eq!(bitset.len(), 2);
-        bitset.insert(104u32);
-        assert_eq!(bitset.len(), 3);
-    }
-
-    #[test]
-    fn test_bitset_clear() {
-        let mut bitset = BitSet::with_max_value(1_000);
-        let els = tests::sample(1_000, 0.01f32);
-        for &el in &els {
-            bitset.insert(el);
-        }
-        assert!(els.iter().all(|el| bitset.contains(*el)));
-        bitset.clear();
-        for el in 0u32..1000u32 {
-            assert!(!bitset.contains(el));
-        }
-    }
-
-    #[bench]
-    fn bench_tinyset_pop(b: &mut test::Bencher) {
-        b.iter(|| {
-            let mut tinyset = TinySet::singleton(test::black_box(31u32));
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-        });
-    }
-
-    #[bench]
-    fn bench_tinyset_sum(b: &mut test::Bencher) {
-        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
-        b.iter(|| {
-            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
-        });
-    }
-
-    #[bench]
-    fn bench_tinyarr_sum(b: &mut test::Bencher) {
-        let v = [10u32, 14u32, 21u32];
-        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
-    }
-
-    #[bench]
-    fn bench_bitset_initialize(b: &mut test::Bencher) {
-        b.iter(|| BitSet::with_max_value(1_000_000));
-    }
-
-}
--- a/src/common/composite_file.rs
+++ b/src/common/composite_file.rs
@@ -4,43 +4,14 @@ use std::collections::HashMap;
 use schema::Field;
 use common::VInt;
 use directory::WritePtr;
-use std::io::{self, Read};
+use std::io;
 use directory::ReadOnlySource;
 use common::BinarySerializable;

-#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
-pub struct FileAddr {
-    field: Field,
-    idx: usize,
-}
-
-impl FileAddr {
-    fn new(field: Field, idx: usize) -> FileAddr {
-        FileAddr { field, idx }
-    }
-}
-
-impl BinarySerializable for FileAddr {
-    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        self.field.serialize(writer)?;
-        VInt(self.idx as u64).serialize(writer)?;
-        Ok(())
-    }
-
-    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let field = Field::deserialize(reader)?;
-        let idx = VInt::deserialize(reader)?.0 as usize;
-        Ok(FileAddr {
-            field,
-            idx,
-        })
-    }
-}
-
 /// A `CompositeWrite` is used to write a `CompositeFile`.
 pub struct CompositeWrite<W = WritePtr> {
    write: CountingWriter<W>,
-    offsets: HashMap<FileAddr, usize>,
+    offsets: HashMap<Field, usize>,
 }

 impl<W: Write> CompositeWrite<W> {
@@ -55,15 +26,9 @@ impl<W: Write> CompositeWrite<W> {

    /// Start writing a new field.
    pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
-        self.for_field_with_idx(field, 0)
-    }
-
-    /// Start writing a new field.
-    pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter<W> {
        let offset = self.write.written_bytes();
-        let file_addr = FileAddr::new(field, idx);
-        assert!(!self.offsets.contains_key(&file_addr));
-        self.offsets.insert(file_addr, offset);
+        assert!(!self.offsets.contains_key(&field));
+        self.offsets.insert(field, offset);
        &mut self.write
    }

@@ -77,16 +42,16 @@ impl<W: Write> CompositeWrite<W> {

        let mut offset_fields: Vec<_> = self.offsets
            .iter()
-            .map(|(file_addr, offset)| (*offset, *file_addr))
+            .map(|(field, offset)| (offset, field))
            .collect();

        offset_fields.sort();

        let mut prev_offset = 0;
-        for (offset, file_addr) in offset_fields {
+        for (offset, field) in offset_fields {
            VInt((offset - prev_offset) as u64).serialize(&mut self.write)?;
-            file_addr.serialize(&mut self.write)?;
-            prev_offset = offset;
+            field.serialize(&mut self.write)?;
+            prev_offset = *offset;
        }

        let footer_len = (self.write.written_bytes() - footer_offset) as u32;
@@ -105,7 +70,7 @@ impl<W: Write> CompositeWrite<W> {
 #[derive(Clone)]
 pub struct CompositeFile {
    data: ReadOnlySource,
-    offsets_index: HashMap<FileAddr, (usize, usize)>,
+    offsets_index: HashMap<Field, (usize, usize)>,
 }

 impl CompositeFile {
@@ -121,7 +86,7 @@ impl CompositeFile {
        let mut footer_buffer = footer_data.as_slice();
        let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;

-        let mut file_addrs = vec![];
+        let mut fields = vec![];
        let mut offsets = vec![];

        let mut field_index = HashMap::new();
@@ -129,16 +94,16 @@ impl CompositeFile {
        let mut offset = 0;
        for _ in 0..num_fields {
            offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
-            let file_addr = FileAddr::deserialize(&mut footer_buffer)?;
+            let field = Field::deserialize(&mut footer_buffer)?;
            offsets.push(offset);
-            file_addrs.push(file_addr);
+            fields.push(field);
        }
        offsets.push(footer_start);
        for i in 0..num_fields {
-            let file_addr = file_addrs[i];
+            let field = fields[i];
            let start_offset = offsets[i];
            let end_offset = offsets[i + 1];
-            field_index.insert(file_addr, (start_offset, end_offset));
+            field_index.insert(field, (start_offset, end_offset));
        }

        Ok(CompositeFile {
@@ -159,14 +124,8 @@ impl CompositeFile {
    /// Returns the `ReadOnlySource` associated
    /// to a given `Field` and stored in a `CompositeFile`.
    pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
-        self.open_read_with_idx(field, 0)
-    }
-
-    /// Returns the `ReadOnlySource` associated
-    /// to a given `Field` and stored in a `CompositeFile`.
-    pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
        self.offsets_index
-            .get(&FileAddr { field, idx, })
+            .get(&field)
            .map(|&(from, to)| self.data.slice(from, to))
    }
 }
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -1,59 +1,22 @@
-    mod serialize;
+mod serialize;
+mod timer;
 mod vint;
 mod counting_writer;
 mod composite_file;
 pub mod bitpacker;
-mod bitset;

 pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
-pub use self::serialize::{BinarySerializable, FixedSize};
+pub use self::serialize::BinarySerializable;
+pub use self::timer::Timing;
+pub use self::timer::TimerTree;
+pub use self::timer::OpenTimer;
 pub use self::vint::VInt;
 pub use self::counting_writer::CountingWriter;
-pub use self::bitset::BitSet;
-pub(crate) use self::bitset::TinySet;
-pub use byteorder::LittleEndian as Endianness;

 use std::io;

-/// Computes the number of bits that will be used for bitpacking.
-///
-/// In general the target is the minimum number of bits
-/// required to express the amplitude given in argument.
-///
-/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
-///
-/// The logic is slightly more convoluted here as for optimization
-/// reasons, we want to ensure that a value spawns over at most 8 bytes
-/// of aligns bytes.
-///
-/// Spanning over 9 bytes is possible for instance, if we do
-/// bitpacking with an amplitude of 63 bits.
-/// In this case, the second int will start on bit
-/// 63 (which belongs to byte 7) and ends at byte 15;
-/// Hence 9 bytes (from byte 7 to byte 15 included).
-///
-/// To avoid this, we force the number of bits to 64bits
-/// when the result is greater than `64-8 = 56 bits`.
-///
-/// Note that this only affects rare use cases spawning over
-/// a very large range of values. Even in this case, it results
-/// in an extra cost of at most 12% compared to the optimal
-/// number of bits.
-pub(crate) fn compute_num_bits(n: u64) -> u8 {
-    let amplitude = (64u32 - n.leading_zeros()) as u8;
-    if amplitude <= 64 - 8 {
-        amplitude
-    } else {
-        64
-    }
-}
-
-pub(crate) fn is_power_of_2(n: usize) -> bool {
-    (n > 0) && (n & (n - 1) == 0)
-}
-
 /// Create a default io error given a string.
-pub(crate) fn make_io_err(msg: String) -> io::Error {
+pub fn make_io_err(msg: String) -> io::Error {
    io::Error::new(io::ErrorKind::Other, msg)
 }

@@ -102,10 +65,9 @@ pub fn u64_to_i64(val: u64) -> i64 {
 }

 #[cfg(test)]
-pub(crate) mod test {
+mod test {

-    use super::{compute_num_bits, i64_to_u64, u64_to_i64};
-    pub use super::serialize::test::fixed_size_test;
+    use super::{i64_to_u64, u64_to_i64};

    fn test_i64_converter_helper(val: i64) {
        assert_eq!(u64_to_i64(i64_to_u64(val)), val);
@@ -122,16 +84,4 @@ pub(crate) mod test {
            test_i64_converter_helper(i);
        }
    }
-
-    #[test]
-    fn test_compute_num_bits() {
-        assert_eq!(compute_num_bits(1), 1u8);
-        assert_eq!(compute_num_bits(0), 0u8);
-        assert_eq!(compute_num_bits(2), 2u8);
-        assert_eq!(compute_num_bits(3), 2u8);
-        assert_eq!(compute_num_bits(4), 3u8);
-        assert_eq!(compute_num_bits(255), 8u8);
-        assert_eq!(compute_num_bits(256), 9u8);
-        assert_eq!(compute_num_bits(5_000_000_000), 33u8);
-    }
 }
--- a/src/common/serialize.rs
+++ b/src/common/serialize.rs
@@ -1,25 +1,16 @@
 use byteorder::{ReadBytesExt, WriteBytesExt};
-use common::Endianness;
+use byteorder::LittleEndian as Endianness;
 use std::fmt;
 use std::io::Write;
 use std::io::Read;
 use std::io;
 use common::VInt;

-/// Trait for a simple binary serialization.
 pub trait BinarySerializable: fmt::Debug + Sized {
-    /// Serialize
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()>;
-    /// Deserialize
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
 }

-/// `FixedSize` marks a `BinarySerializable` as
-/// always serializing to the same size.
-pub trait FixedSize: BinarySerializable {
-    const SIZE_IN_BYTES: usize;
-}
-
 impl BinarySerializable for () {
    fn serialize<W: Write>(&self, _: &mut W) -> io::Result<()> {
        Ok(())
@@ -29,10 +20,6 @@ impl BinarySerializable for () {
    }
 }

-impl FixedSize for () {
-    const SIZE_IN_BYTES: usize = 0;
-}
-
 impl<T: BinarySerializable> BinarySerializable for Vec<T> {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        VInt(self.len() as u64).serialize(writer)?;
@@ -72,10 +59,6 @@ impl BinarySerializable for u32 {
    }
 }

-impl FixedSize for u32 {
-    const SIZE_IN_BYTES: usize = 4;
-}
-
 impl BinarySerializable for u64 {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_u64::<Endianness>(*self)
@@ -85,10 +68,6 @@ impl BinarySerializable for u64 {
    }
 }

-impl FixedSize for u64 {
-    const SIZE_IN_BYTES: usize = 8;
-}
-
 impl BinarySerializable for i64 {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_i64::<Endianness>(*self)
@@ -98,10 +77,6 @@ impl BinarySerializable for i64 {
    }
 }

-impl FixedSize for i64 {
-    const SIZE_IN_BYTES: usize = 8;
-}
-
 impl BinarySerializable for u8 {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_u8(*self)
@@ -111,10 +86,6 @@ impl BinarySerializable for u8 {
    }
 }

-impl FixedSize for u8 {
-    const SIZE_IN_BYTES: usize = 1;
-}
-
 impl BinarySerializable for String {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        let data: &[u8] = self.as_bytes();
@@ -133,78 +104,63 @@ impl BinarySerializable for String {
 }

 #[cfg(test)]
-pub mod test {
+mod test {

    use common::VInt;
    use super::*;

-    pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
-        let mut buffer = Vec::new();
-        O::default().serialize(&mut buffer).unwrap();
-        assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
-    }
-
-    fn serialize_test<T: BinarySerializable + Eq>(v: T) -> usize {
+    fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
        let mut buffer: Vec<u8> = Vec::new();
-        v.serialize(&mut buffer).unwrap();
-        let num_bytes = buffer.len();
+        if num_bytes != 0 {
+            v.serialize(&mut buffer).unwrap();
+            assert_eq!(buffer.len(), num_bytes);
+        } else {
+            v.serialize(&mut buffer).unwrap();
+        }
        let mut cursor = &buffer[..];
        let deser = T::deserialize(&mut cursor).unwrap();
        assert_eq!(deser, v);
-        num_bytes
    }

    #[test]
    fn test_serialize_u8() {
-        fixed_size_test::<u8>();
+        serialize_test(3u8, 1);
+        serialize_test(5u8, 1);
    }

    #[test]
    fn test_serialize_u32() {
-        fixed_size_test::<u32>();
-        assert_eq!(4, serialize_test(3u32));
-        assert_eq!(4, serialize_test(5u32));
-        assert_eq!(4, serialize_test(u32::max_value()));
-    }
-
-    #[test]
-    fn test_serialize_i64() {
-        fixed_size_test::<i64>();
-    }
-
-    #[test]
-    fn test_serialize_u64() {
-        fixed_size_test::<u64>();
+        serialize_test(3u32, 4);
+        serialize_test(5u32, 4);
+        serialize_test(u32::max_value(), 4);
    }

    #[test]
    fn test_serialize_string() {
-        assert_eq!(serialize_test(String::from("")), 1);
-        assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4);
-        assert_eq!(
-            serialize_test(String::from("富士さん見える。")),
-            1 + 3 * 8
-        );
+        serialize_test(String::from(""), 1);
+        serialize_test(String::from("ぽよぽよ"), 1 + 3 * 4);
+        serialize_test(String::from("富士さん見える。"), 1 + 3 * 8);
    }

    #[test]
    fn test_serialize_vec() {
-        assert_eq!(serialize_test(Vec::<u8>::new()), 1);
-        assert_eq!(serialize_test(vec![1u32, 3u32]), 1 + 4 * 2);
+        let v: Vec<u8> = Vec::new();
+        serialize_test(v, 1);
+        serialize_test(vec![1u32, 3u32], 1 + 4 * 2);
    }

    #[test]
    fn test_serialize_vint() {
        for i in 0..10_000 {
-            serialize_test(VInt(i as u64));
+            serialize_test(VInt(i as u64), 0);
        }
-        assert_eq!(serialize_test(VInt(7u64)), 1);
-        assert_eq!(serialize_test(VInt(127u64)), 1);
-        assert_eq!(serialize_test(VInt(128u64)), 2);
-        assert_eq!(serialize_test(VInt(129u64)), 2);
-        assert_eq!(serialize_test(VInt(1234u64)), 2);
-        assert_eq!(serialize_test(VInt(16_383u64)), 2);
-        assert_eq!(serialize_test(VInt(16_384u64)), 3);
-        assert_eq!(serialize_test(VInt(u64::max_value())), 10);
+        serialize_test(VInt(7u64), 1);
+        serialize_test(VInt(127u64), 1);
+        serialize_test(VInt(128u64), 2);
+        serialize_test(VInt(129u64), 2);
+        serialize_test(VInt(1234u64), 2);
+        serialize_test(VInt(16_383), 2);
+        serialize_test(VInt(16_384), 3);
+        serialize_test(VInt(u64::max_value()), 10);
    }
 }
--- a/src/common/timer.rs
+++ b/src/common/timer.rs
@@ -0,0 +1,99 @@
+use time::PreciseTime;
+
+pub struct OpenTimer<'a> {
+    name: &'static str,
+    timer_tree: &'a mut TimerTree,
+    start: PreciseTime,
+    depth: u32,
+}
+
+impl<'a> OpenTimer<'a> {
+    /// Starts timing a new named subtask
+    ///
+    /// The timer is stopped automatically
+    /// when the `OpenTimer` is dropped.
+    pub fn open(&mut self, name: &'static str) -> OpenTimer {
+        OpenTimer {
+            name: name,
+            timer_tree: self.timer_tree,
+            start: PreciseTime::now(),
+            depth: self.depth + 1,
+        }
+    }
+}
+
+impl<'a> Drop for OpenTimer<'a> {
+    fn drop(&mut self) {
+        self.timer_tree.timings.push(Timing {
+            name: self.name,
+            duration: self.start
+                .to(PreciseTime::now())
+                .num_microseconds()
+                .unwrap(),
+            depth: self.depth,
+        });
+    }
+}
+
+/// Timing recording
+#[derive(Debug, Serialize)]
+pub struct Timing {
+    name: &'static str,
+    duration: i64,
+    depth: u32,
+}
+
+/// Timer tree
+#[derive(Debug, Serialize)]
+pub struct TimerTree {
+    timings: Vec<Timing>,
+}
+
+impl TimerTree {
+    /// Returns the total time elapsed in microseconds
+    pub fn total_time(&self) -> i64 {
+        self.timings.last().unwrap().duration
+    }
+
+    /// Open a new named subtask
+    pub fn open(&mut self, name: &'static str) -> OpenTimer {
+        OpenTimer {
+            name: name,
+            timer_tree: self,
+            start: PreciseTime::now(),
+            depth: 0,
+        }
+    }
+}
+
+impl Default for TimerTree {
+    fn default() -> TimerTree {
+        TimerTree {
+            timings: Vec::new(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_timer() {
+        let mut timer_tree = TimerTree::default();
+        {
+            let mut a = timer_tree.open("a");
+            {
+                let mut ab = a.open("b");
+                {
+                    let _abc = ab.open("c");
+                }
+                {
+                    let _abd = ab.open("d");
+                }
+            }
+        }
+        assert_eq!(timer_tree.timings.len(), 4);
+    }
+}
--- a/src/common/vint.rs
+++ b/src/common/vint.rs
@@ -11,10 +11,6 @@ impl VInt {
    pub fn val(&self) -> u64 {
        self.0
    }
-
-    pub fn deserialize_u64<R: Read>(reader: &mut R) -> io::Result<u64> {
-        VInt::deserialize(reader).map(|vint| vint.0)
-    }
 }

 impl BinarySerializable for VInt {
--- a/src/compression/mod.rs
+++ b/src/compression/mod.rs
@@ -3,97 +3,39 @@

 mod stream;

-pub const COMPRESSION_BLOCK_SIZE: usize = 128;
-const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
-
 pub use self::stream::CompressedIntStream;

+#[cfg(not(feature = "simdcompression"))]
+mod pack {
+    mod compression_pack_nosimd;
+    pub use self::compression_pack_nosimd::{BlockDecoder, BlockEncoder};
+}

-use bitpacking::{BitPacker, BitPacker4x};
+#[cfg(feature = "simdcompression")]
+mod pack {
+    mod compression_pack_simd;
+    pub use self::compression_pack_simd::{BlockDecoder, BlockEncoder};
+}

+pub use self::pack::{BlockDecoder, BlockEncoder};
+
+#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
+mod vint {
+    mod compression_vint_nosimd;
+    pub(crate) use self::compression_vint_nosimd::*;
+}
+
+#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
+mod vint {
+    mod compression_vint_simd;
+    pub(crate) use self::compression_vint_simd::*;
+}

 /// Returns the size in bytes of a compressed block, given `num_bits`.
 pub fn compressed_block_size(num_bits: u8) -> usize {
-    1 + (num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
+    1 + (num_bits as usize) * 16
 }

-pub struct BlockEncoder {
-    bitpacker: BitPacker4x,
-    pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
-    pub output_len: usize,
-}
-
-impl BlockEncoder {
-    pub fn new() -> BlockEncoder {
-        BlockEncoder {
-            bitpacker: BitPacker4x::new(),
-            output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
-            output_len: 0,
-        }
-    }
-
-    pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
-        let num_bits = self.bitpacker.num_bits_sorted(offset, block);
-        self.output[0] = num_bits;
-        let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits);
-        &self.output[..written_size]
-    }
-
-    pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
-        let num_bits = self.bitpacker.num_bits(block);
-        self.output[0] = num_bits;
-        let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
-        &self.output[..written_size]
-    }
-}
-
-
-pub struct BlockDecoder {
-    bitpacker: BitPacker4x,
-    pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
-    pub output_len: usize,
-}
-
-impl BlockDecoder {
-    pub fn new() -> BlockDecoder {
-        BlockDecoder::with_val(0u32)
-    }
-
-    pub fn with_val(val: u32) -> BlockDecoder {
-        let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
-        output[COMPRESSION_BLOCK_SIZE] = 0u32;
-        BlockDecoder {
-            bitpacker: BitPacker4x::new(),
-            output,
-            output_len: 0,
-        }
-    }
-    
-    pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
-        let num_bits = compressed_data[0];
-        self.output_len = COMPRESSION_BLOCK_SIZE;
-        1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
-    }
-
-    pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
-        let num_bits = compressed_data[0];
-        self.output_len = COMPRESSION_BLOCK_SIZE;
-        1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
-    }
-
-    #[inline]
-    pub fn output_array(&self) -> &[u32] {
-        &self.output[..self.output_len]
-    }
-
-    #[inline]
-    pub fn output(&self, idx: usize) -> u32 {
-        self.output[idx]
-    }
-}
-
-mod vint;
-
 pub trait VIntEncoder {
    /// Compresses an array of `u32` integers,
    /// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
@@ -170,6 +112,8 @@ impl VIntDecoder for BlockDecoder {
    }
 }

+pub const COMPRESSION_BLOCK_SIZE: usize = 128;
+
 #[cfg(test)]
 pub mod tests {

--- a/src/compression/pack/compression_pack_nosimd.rs
+++ b/src/compression/pack/compression_pack_nosimd.rs
@@ -0,0 +1,142 @@
+use common::bitpacker::compute_num_bits;
+use common::bitpacker::{BitPacker, BitUnpacker};
+use common::CountingWriter;
+use std::cmp;
+use std::io::Write;
+use super::super::COMPRESSION_BLOCK_SIZE;
+
+const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
+
+/// Delta-encodes the first `COMPRESSION_BLOCK_SIZE` entries of `vals` in place (starting
+/// from `offset`) and bit-packs them into `output`; returns the number of bytes written.
+pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize {
+    let mut max_delta = 0u32;
+    let mut previous = offset;
+    for val in vals[..COMPRESSION_BLOCK_SIZE].iter_mut() {
+        let current = *val;
+        *val = current - previous;
+        max_delta = cmp::max(max_delta, *val);
+        previous = current;
+    }
+    let num_bits = compute_num_bits(max_delta as u64);
+    let mut counting_writer = CountingWriter::wrap(output);
+    counting_writer.write_all(&[num_bits]).unwrap();
+    let mut bit_packer = BitPacker::new(num_bits as usize);
+    for val in vals[..COMPRESSION_BLOCK_SIZE].iter() {
+        bit_packer.write(*val as u64, &mut counting_writer).unwrap();
+    }
+    // Flush as `compress_block_unsorted` does, so that no trailing bits stay in the packer.
+    bit_packer.flush(&mut counting_writer).unwrap();
+    counting_writer.written_bytes()
+}
+
+/// Scalar (non-SIMD) encoder for blocks of `COMPRESSION_BLOCK_SIZE` integers.
+pub struct BlockEncoder {
+    pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
+    pub output_len: usize,
+    input_buffer: [u32; COMPRESSION_BLOCK_SIZE],
+}
+
+impl BlockEncoder {
+    /// Creates an encoder whose buffers are zero-filled.
+    pub fn new() -> BlockEncoder {
+        BlockEncoder {
+            output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
+            output_len: 0,
+            input_buffer: [0u32; COMPRESSION_BLOCK_SIZE],
+        }
+    }
+
+    /// Delta-encodes and bit-packs a sorted block of exactly `COMPRESSION_BLOCK_SIZE`
+    /// values (`clone_from_slice` panics on any other length).
+    pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
+        self.input_buffer.clone_from_slice(vals);
+        let written = compress_sorted(&mut self.input_buffer, &mut self.output, offset);
+        &self.output[..written]
+    }
+
+    /// Bit-packs `vals` without delta-encoding. Fewer than `COMPRESSION_BLOCK_SIZE`
+    /// values are padded up to a full block by repeating `vals[0]`.
+    pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
+        let largest = vals.iter()
+            .cloned()
+            .max()
+            .expect("compress unsorted called with an empty array");
+        let num_bits = compute_num_bits(largest as u64);
+        let written = {
+            let mut counting_writer = CountingWriter::wrap(&mut self.output[..]);
+            counting_writer.write_all(&[num_bits]).unwrap();
+            let mut bit_packer = BitPacker::new(num_bits as usize);
+            for &val in vals {
+                bit_packer.write(val as u64, &mut counting_writer).unwrap();
+            }
+            for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
+                bit_packer.write(vals[0] as u64, &mut counting_writer).unwrap();
+            }
+            // `flush` rather than "closing": the latter would add 7 bytes of padding.
+            bit_packer.flush(&mut counting_writer)
+                .expect("Flushing the bitpacking in an in RAM buffer should never fail");
+            counting_writer.written_bytes()
+        };
+        &self.output[..written]
+    }
+}
+
+/// Scalar (non-SIMD) decoder for bit-packed blocks of `COMPRESSION_BLOCK_SIZE` integers.
+pub struct BlockDecoder {
+    pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE],
+    pub output_len: usize,
+}
+
+impl BlockDecoder {
+    pub fn new() -> BlockDecoder {
+        BlockDecoder::with_val(0u32)
+    }
+
+    /// Creates a decoder whose output buffer is filled with `val`.
+    pub fn with_val(val: u32) -> BlockDecoder {
+        BlockDecoder {
+            output: [val; COMPRESSED_BLOCK_MAX_SIZE],
+            output_len: 0,
+        }
+    }
+
+    /// Decodes a delta-encoded block seeded with `offset`; returns the bytes consumed.
+    pub fn uncompress_block_sorted<'a>(
+        &mut self,
+        compressed_data: &'a [u8],
+        mut offset: u32,
+    ) -> usize {
+        let num_bits = compressed_data[0] as usize;
+        let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
+        for (i, slot) in self.output[..COMPRESSION_BLOCK_SIZE].iter_mut().enumerate() {
+            offset += bit_unpacker.get(i) as u32;
+            *slot = offset;
+        }
+        self.output_len = COMPRESSION_BLOCK_SIZE;
+        1 + (num_bits * COMPRESSION_BLOCK_SIZE + 7) / 8
+    }
+
+    /// Decodes a block packed without delta-encoding; returns the bytes consumed.
+    pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
+        let num_bits = compressed_data[0] as usize;
+        let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
+        for (i, slot) in self.output[..COMPRESSION_BLOCK_SIZE].iter_mut().enumerate() {
+            *slot = bit_unpacker.get(i) as u32;
+        }
+        self.output_len = COMPRESSION_BLOCK_SIZE;
+        1 + (num_bits * COMPRESSION_BLOCK_SIZE + 7) / 8
+    }
+
+    /// Returns the first `output_len` entries of the output buffer.
+    #[inline]
+    pub fn output_array(&self) -> &[u32] {
+        &self.output[..self.output_len]
+    }
+
+    /// Returns the entry at position `idx` of the output buffer.
+    #[inline]
+    pub fn output(&self, idx: usize) -> u32 {
+        self.output[idx]
+    }
+}
--- a/src/compression/pack/compression_pack_simd.rs
+++ b/src/compression/pack/compression_pack_simd.rs
@@ -0,0 +1,165 @@
+use super::super::COMPRESSION_BLOCK_SIZE;
+
+const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
+
+/// Raw bindings to the C `simdcomp` block codec.
+///
+/// None of these functions takes a length; they presumably always process blocks of
+/// `COMPRESSION_BLOCK_SIZE` integers (TODO confirm against the C sources).
+mod simdcomp {
+    use libc::size_t;
+
+    extern "C" {
+        pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
+
+        pub fn uncompress_sorted(
+            compressed_data: *const u8,
+            output: *mut u32,
+            offset: u32,
+        ) -> size_t;
+
+        pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
+
+        pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t;
+    }
+}
+
+/// Panics unless `vals` holds a full block and `output` fits the largest compressed block.
+fn check_compress_buffers(vals: &[u32], output: &[u8]) {
+    assert!(vals.len() >= COMPRESSION_BLOCK_SIZE, "input block is too short");
+    assert!(output.len() >= COMPRESSED_BLOCK_MAX_SIZE, "output buffer is too small");
+}
+
+/// Panics unless `compressed_data` is non-empty and `output` can hold a full block.
+fn check_uncompress_buffers(compressed_data: &[u8], output: &[u32]) {
+    assert!(!compressed_data.is_empty(), "compressed block is empty");
+    assert!(output.len() >= COMPRESSION_BLOCK_SIZE, "output buffer is too small");
+}
+
+fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
+    check_compress_buffers(vals, output);
+    // SAFETY: both buffers were just checked to be large enough for one full block.
+    unsafe { simdcomp::compress_sorted(vals.as_ptr(), output.as_mut_ptr(), offset) }
+}
+
+fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
+    check_uncompress_buffers(compressed_data, output);
+    // SAFETY: `output` can hold a full block and the first input byte exists.
+    // NOTE(review): the payload length is not validated here; callers must pass a whole block.
+    unsafe {
+        simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset)
+    }
+}
+
+fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {
+    check_compress_buffers(vals, output);
+    // SAFETY: both buffers were just checked to be large enough for one full block.
+    unsafe { simdcomp::compress_unsorted(vals.as_ptr(), output.as_mut_ptr()) }
+}
+
+fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize {
+    check_uncompress_buffers(compressed_data, output);
+    // SAFETY: `output` can hold a full block and the first input byte exists.
+    // NOTE(review): the payload length is not validated here; callers must pass a whole block.
+    unsafe { simdcomp::uncompress_unsorted(compressed_data.as_ptr(), output.as_mut_ptr()) }
+}
+
+/// Block encoder backed by the `simdcomp` C routines.
+pub struct BlockEncoder {
+    pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
+    pub output_len: usize,
+}
+
+impl BlockEncoder {
+    pub fn new() -> BlockEncoder {
+        BlockEncoder {
+            output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
+            output_len: 0,
+        }
+    }
+
+    /// Compresses a sorted block relative to `offset`.
+    ///
+    /// # Panics
+    /// Panics if `vals` holds fewer than `COMPRESSION_BLOCK_SIZE` values.
+    pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
+        let compressed_size = compress_sorted(vals, &mut self.output, offset);
+        &self.output[..compressed_size]
+    }
+
+    /// Compresses an unsorted block.
+    ///
+    /// The C routine takes no length and is assumed to read a full block, so shorter input
+    /// is first padded with `vals[0]` (as the nosimd encoder does) rather than overread.
+    ///
+    /// # Panics
+    /// Panics if `vals` is empty.
+    pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
+        let compressed_size = if vals.len() >= COMPRESSION_BLOCK_SIZE {
+            compress_unsorted(vals, &mut self.output)
+        } else {
+            let first = *vals
+                .first()
+                .expect("compress unsorted called with an empty array");
+            let mut padded = [first; COMPRESSION_BLOCK_SIZE];
+            padded[..vals.len()].copy_from_slice(vals);
+            compress_unsorted(&padded, &mut self.output)
+        };
+        &self.output[..compressed_size]
+    }
+}
+
+/// Block decoder backed by the `simdcomp` C routines.
+pub struct BlockDecoder {
+    pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE],
+    pub output_len: usize,
+}
+
+impl BlockDecoder {
+    pub fn new() -> BlockDecoder {
+        BlockDecoder::with_val(0u32)
+    }
+
+    /// Creates a decoder whose output buffer is filled with `val`.
+    pub fn with_val(val: u32) -> BlockDecoder {
+        BlockDecoder { output: [val; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0 }
+    }
+
+    /// Decodes a sorted block relative to `offset`; returns what the C routine reports.
+    pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
+        self.output_len = COMPRESSION_BLOCK_SIZE;
+        uncompress_sorted(compressed_data, &mut self.output, offset)
+    }
+
+    /// Decodes an unsorted block; returns what the C routine reports.
+    pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
+        self.output_len = COMPRESSION_BLOCK_SIZE;
+        uncompress_unsorted(compressed_data, &mut self.output)
+    }
+
+    /// Returns the first `output_len` entries of the output buffer.
+    #[inline]
+    pub fn output_array(&self) -> &[u32] {
+        &self.output[..self.output_len]
+    }
+
+    #[inline]
+    pub fn output(&self, idx: usize) -> u32 {
+        self.output[idx]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::BlockEncoder;
+
+    /// A full block of consecutive doc ids compresses to 17 bytes
+    /// (presumably one header byte plus 128 one-bit deltas).
+    #[test]
+    fn test_all_docs_compression_len() {
+        let doc_ids: Vec<u32> = (0u32..128u32).collect();
+        let mut block_encoder = BlockEncoder::new();
+        let num_bytes = block_encoder.compress_block_sorted(&doc_ids, 0u32).len();
+        assert_eq!(num_bytes, 17);
+    }
+}
--- a/src/compression/stream.rs
+++ b/src/compression/stream.rs
@@ -11,12 +11,7 @@ use directory::{ReadOnlySource, SourceRead};
 /// decompressing blocks that are not required.
 pub struct CompressedIntStream {
    buffer: SourceRead,
-
    block_decoder: BlockDecoder,
-    cached_addr: usize, // address of the currently decoded block
-    cached_next_addr: usize, // address following the currently decoded block
-
-    addr: usize, // address of the block associated to the current position
    inner_offset: usize,
 }

@@ -26,47 +21,34 @@ impl CompressedIntStream {
        CompressedIntStream {
            buffer: SourceRead::from(source),
            block_decoder: BlockDecoder::new(),
-            cached_addr: usize::max_value(),
-            cached_next_addr: usize::max_value(),
-
-            addr: 0,
-            inner_offset: 0,
+            inner_offset: COMPRESSION_BLOCK_SIZE,
        }
    }

-    /// Loads the block at the given address and return the address of the
-    /// following block
-    pub fn read_block(&mut self, addr: usize) -> usize {
-        if self.cached_addr == addr {
-            // we are already on this block.
-            // no need to read.
-            self.cached_next_addr
-        } else {
-            let next_addr = addr + self.block_decoder.uncompress_block_unsorted(self.buffer.slice_from(addr));
-            self.cached_addr = addr;
-            self.cached_next_addr = next_addr;
-            next_addr
-        }
-    }
-
-    /// Fills a buffer with the next `output.len()` integers.
-    /// This does not consume / advance the stream.
+    /// Fills a buffer with the next `output.len()` integers,
+    /// and advances the stream by that many elements.
    pub fn read(&mut self, output: &mut [u32]) {
-        let mut cursor = self.addr;
-        let mut inner_offset = self.inner_offset;
        let mut num_els: usize = output.len();
-        let mut start = 0;
+        let mut start: usize = 0;
        loop {
-            cursor = self.read_block(cursor);
-            let block = &self.block_decoder.output_array()[inner_offset..];
-            let block_len = block.len();
-            if num_els >= block_len {
-                output[start..start + block_len].clone_from_slice(&block);
-                start += block_len;
-                num_els -= block_len;
-                inner_offset = 0;
+            let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
+            if num_els >= available {
+                if available > 0 {
+                    let uncompressed_block =
+                        &self.block_decoder.output_array()[self.inner_offset..];
+                    output[start..][..available].clone_from_slice(uncompressed_block);
+                }
+                num_els -= available;
+                start += available;
+                let num_consumed_bytes = self.block_decoder
+                    .uncompress_block_unsorted(self.buffer.as_ref());
+                self.buffer.advance(num_consumed_bytes);
+                self.inner_offset = 0;
            } else {
-                output[start..].clone_from_slice(&block[..num_els]);
+                let uncompressed_block = &self.block_decoder.output_array()
+                    [self.inner_offset..self.inner_offset + num_els];
+                output[start..][..num_els].clone_from_slice(uncompressed_block);
+                self.inner_offset += num_els;
                break;
            }
        }
@@ -76,22 +58,23 @@ impl CompressedIntStream {
    ///
    /// If a full block is skipped, calling
    /// `.skip(...)` will avoid decompressing it.
-    ///
-    /// May panic if the end of the stream is reached.
    pub fn skip(&mut self, mut skip_len: usize) {
-        loop {
-            let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
-            if available >= skip_len {
-                self.inner_offset += skip_len;
-                break;
-            } else {
-                skip_len -= available;
-                // entirely skip decompressing some blocks.
-                let num_bits: u8 = self.buffer.get(self.addr);
+        let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
+        if available >= skip_len {
+            self.inner_offset += skip_len;
+        } else {
+            skip_len -= available;
+            // entirely skip decompressing some blocks.
+            while skip_len >= COMPRESSION_BLOCK_SIZE {
+                skip_len -= COMPRESSION_BLOCK_SIZE;
+                let num_bits: u8 = self.buffer.as_ref()[0];
                let block_len = compressed_block_size(num_bits);
-                self.addr += block_len;
-                self.inner_offset = 0;
+                self.buffer.advance(block_len);
            }
+            let num_consumed_bytes = self.block_decoder
+                .uncompress_block_unsorted(self.buffer.as_ref());
+            self.buffer.advance(num_consumed_bytes);
+            self.inner_offset = skip_len;
        }
    }
 }
@@ -108,7 +91,7 @@ pub mod tests {
    fn create_stream_buffer() -> ReadOnlySource {
        let mut buffer: Vec<u8> = vec![];
        let mut encoder = BlockEncoder::new();
-        let vals: Vec<u32> = (0u32..1152u32).collect();
+        let vals: Vec<u32> = (0u32..1_025u32).collect();
        for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
            let compressed_block = encoder.compress_block_unsorted(chunk);
            let num_bits = compressed_block[0];
@@ -130,24 +113,13 @@ pub mod tests {
        stream.read(&mut block[0..2]);
        assert_eq!(block[0], 0);
        assert_eq!(block[1], 1);
-
-        // reading does not consume the stream
-        stream.read(&mut block[0..2]);
-        assert_eq!(block[0], 0);
-        assert_eq!(block[1], 1);
-        stream.skip(2);
-
        stream.skip(5);
        stream.read(&mut block[0..3]);
-        stream.skip(3);
-
        assert_eq!(block[0], 7);
        assert_eq!(block[1], 8);
        assert_eq!(block[2], 9);
        stream.skip(500);
        stream.read(&mut block[0..3]);
-        stream.skip(3);
-
        assert_eq!(block[0], 510);
        assert_eq!(block[1], 511);
        assert_eq!(block[2], 512);
--- a/src/compression/vint/compression_vint_nosimd.rs
+++ b/src/compression/vint/compression_vint_nosimd.rs
--- a/src/compression/vint/compression_vint_simd.rs
+++ b/src/compression/vint/compression_vint_simd.rs
@@ -0,0 +1,72 @@
+mod streamvbyte {
+    // Raw bindings to the C streamvbyte routines (their prototypes are not visible here).
+    use libc::size_t;
+
+    extern "C" {
+        pub fn streamvbyte_delta_encode(
+            data: *const u32,
+            num_els: u32,
+            output: *mut u8,
+            offset: u32,
+        ) -> size_t;
+
+        pub fn streamvbyte_delta_decode(
+            compressed_data: *const u8,
+            output: *mut u32,
+            num_els: u32,
+            offset: u32,
+        ) -> size_t;
+
+        pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
+
+        pub fn streamvbyte_decode(
+            compressed_data: *const u8,
+            output: *mut u32,
+            num_els: usize, // NOTE(review): siblings declare `num_els: u32`; confirm the C prototype
+        ) -> size_t;
+    }
+}
+
+/// Delta-encodes `input` (relative to `offset`) into `output`; returns the written prefix.
+#[inline(always)]
+pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
+    let (src, num_els, dst) = (input.as_ptr(), input.len() as u32, output.as_mut_ptr());
+    // SAFETY: both pointers come from live slices. NOTE(review): nothing here checks that
+    // `output` is large enough for the encoded data; the caller must size it.
+    let written = unsafe { streamvbyte::streamvbyte_delta_encode(src, num_els, dst, offset) };
+    &output[..written]
+}
+
+/// Encodes `input` into `output` and returns the written prefix.
+#[inline(always)]
+pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
+    let (src, num_els, dst) = (input.as_ptr(), input.len() as u32, output.as_mut_ptr());
+    // SAFETY: both pointers come from live slices. NOTE(review): nothing here checks that
+    // `output` is large enough for the encoded data; the caller must size it.
+    let written = unsafe { streamvbyte::streamvbyte_encode(src, num_els, dst) };
+    &output[..written]
+}
+
+/// Decodes `output.len()` delta-encoded integers, relative to `offset`, into `output`.
+/// Returns the C routine's result (presumably the number of bytes read; TODO confirm).
+#[inline(always)]
+pub(crate) fn uncompress_sorted<'a>(
+    compressed_data: &'a [u8],
+    output: &mut [u32],
+    offset: u32,
+) -> usize {
+    let (src, dst) = (compressed_data.as_ptr(), output.as_mut_ptr());
+    // SAFETY: both pointers come from live slices. NOTE(review): nothing here checks that
+    // `compressed_data` really holds `output.len()` encoded integers.
+    unsafe { streamvbyte::streamvbyte_delta_decode(src, dst, output.len() as u32, offset) }
+}
+
+/// Decodes `output.len()` integers into `output`.
+/// Returns the C routine's result (presumably the number of bytes read; TODO confirm).
+#[inline(always)]
+pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
+    let (src, dst) = (compressed_data.as_ptr(), output.as_mut_ptr());
+    // SAFETY: both pointers come from live slices. NOTE(review): nothing here checks that
+    // `compressed_data` really holds `output.len()` encoded integers.
+    unsafe { streamvbyte::streamvbyte_decode(src, dst, output.len()) }
+}
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -6,11 +6,7 @@ use std::sync::Arc;
 use std::borrow::BorrowMut;
 use std::fmt;
 use core::SegmentId;
-
-
-#[cfg(feature="mmap")]
-use directory::MmapDirectory;
-use directory::{Directory, RAMDirectory};
+use directory::{Directory, MmapDirectory, RAMDirectory};
 use indexer::index_writer::open_index_writer;
 use core::searcher::Searcher;
 use std::convert::From;
@@ -22,7 +18,6 @@ use core::SegmentMeta;
 use super::pool::LeasedItem;
 use std::path::Path;
 use core::IndexMeta;
-use indexer::DirectoryLock;
 use IndexWriter;
 use directory::ManagedDirectory;
 use core::META_FILEPATH;
@@ -65,7 +60,6 @@ impl Index {
    /// The index will use the `MMapDirectory`.
    ///
    /// If a previous index was in this directory, then its meta file will be destroyed.
-    #[cfg(feature="mmap")]
    pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
        let mmap_directory = MmapDirectory::open(directory_path)?;
        let directory = ManagedDirectory::new(mmap_directory)?;
@@ -85,8 +79,6 @@ impl Index {
    ///
    /// The temp directory is only used for testing the `MmapDirectory`.
    /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
-    #[cfg(feature="mmap")]
-    #[cfg(test)]
    pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
        let mmap_directory = MmapDirectory::create_from_tempdir()?;
        let directory = ManagedDirectory::new(mmap_directory)?;
@@ -114,7 +106,6 @@ impl Index {
    }

    /// Opens a new directory from an index path.
-    #[cfg(feature="mmap")]
    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
        let mmap_directory = MmapDirectory::open(directory_path)?;
        let directory = ManagedDirectory::new(mmap_directory)?;
@@ -122,16 +113,12 @@ impl Index {
        Index::create_from_metas(directory, &metas)
    }

-    pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
-        let directory = ManagedDirectory::new(directory)?;
-        let metas = load_metas(&directory)?;
-        Index::create_from_metas(directory, &metas)
-    }
-
-
-    /// Reads the index meta file from the directory.
-    pub fn load_metas(&self) -> Result<IndexMeta> {
-        load_metas(self.directory())
+    /// Returns the index opstamp: the number of documents that have been added
+    /// from the beginning of time until the moment of the last commit.
+    ///
+    /// Panics if the index metas cannot be loaded from the directory.
+    pub fn opstamp(&self) -> u64 {
+        load_metas(self.directory()).expect("failed to load the index metas").opstamp
+    }

    /// Open a new index writer. Attempts to acquire a lockfile.
@@ -154,8 +141,7 @@ impl Index {
        num_threads: usize,
        heap_size_in_bytes: usize,
    ) -> Result<IndexWriter> {
-        let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
-        open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
+        open_index_writer(self, num_threads, heap_size_in_bytes)
    }

    /// Creates a multithreaded writer
@@ -208,7 +194,7 @@ impl Index {
    /// Reads the meta.json and returns the list of
    /// `SegmentMeta` from the last commit.
    pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
-        Ok(self.load_metas()?.segments)
+        Ok(load_metas(self.directory())?.segments)
    }

    /// Returns the list of segment ids that are searchable.
--- a/src/core/index_meta.rs
+++ b/src/core/index_meta.rs
@@ -1,7 +1,5 @@
 use schema::Schema;
 use core::SegmentMeta;
-use std::fmt;
-use serde_json;

 /// Meta information about the `Index`.
 ///
@@ -11,13 +9,11 @@ use serde_json;
 /// * the index `docstamp`
 /// * the schema
 ///
-#[derive(Clone, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct IndexMeta {
    pub segments: Vec<SegmentMeta>,
    pub schema: Schema,
    pub opstamp: u64,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub payload: Option<String>,
 }

 impl IndexMeta {
@@ -26,43 +22,6 @@ impl IndexMeta {
            segments: vec![],
            schema,
            opstamp: 0u64,
-            payload: None,
        }
    }
 }
-
-impl fmt::Debug for IndexMeta {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(
-            f,
-            "{}",
-            serde_json::ser::to_string(self)
-                .expect("JSON serialization for IndexMeta should never fail.")
-        )
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use serde_json;
-    use super::IndexMeta;
-    use schema::{SchemaBuilder, TEXT};
-
-    #[test]
-    fn test_serialize_metas() {
-        let schema = {
-            let mut schema_builder = SchemaBuilder::new();
-            schema_builder.add_text_field("text", TEXT);
-            schema_builder.build()
-        };
-        let index_metas = IndexMeta {
-            segments: Vec::new(),
-            schema: schema,
-            opstamp: 0u64,
-            payload: None,
-        };
-        let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
-        assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
-    }
-}
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -4,10 +4,10 @@ use postings::{BlockSegmentPostings, SegmentPostings};
 use postings::TermInfo;
 use schema::IndexRecordOption;
 use schema::Term;
+use std::cmp;
+use fastfield::DeleteBitSet;
+use schema::Schema;
 use compression::CompressedIntStream;
-use postings::FreqReadingOption;
-use common::BinarySerializable;
-use schema::FieldType;

 /// The inverted index reader is in charge of accessing
 /// the inverted index associated to a specific field.
@@ -26,47 +26,30 @@ pub struct InvertedIndexReader {
    termdict: TermDictionaryImpl,
    postings_source: ReadOnlySource,
    positions_source: ReadOnlySource,
-    record_option: IndexRecordOption,
-    total_num_tokens: u64
+    delete_bitset: DeleteBitSet,
+    schema: Schema,
 }

 impl InvertedIndexReader {
    pub(crate) fn new(
-        termdict: TermDictionaryImpl,
+        termdict_source: ReadOnlySource,
        postings_source: ReadOnlySource,
        positions_source: ReadOnlySource,
-        record_option: IndexRecordOption,
+        delete_bitset: DeleteBitSet,
+        schema: Schema,
    ) -> InvertedIndexReader {
-        let total_num_tokens_data = postings_source.slice(0, 8);
-        let mut total_num_tokens_cursor = total_num_tokens_data.as_slice();
-        let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64);
        InvertedIndexReader {
-            termdict,
-            postings_source: postings_source.slice_from(8),
+            termdict: TermDictionaryImpl::from_source(termdict_source),
+            postings_source,
            positions_source,
-            record_option,
-            total_num_tokens
-        }
-    }
-
-    /// Creates an empty `InvertedIndexReader` object, which
-    /// contains no terms at all.
-    pub fn empty(field_type: FieldType) -> InvertedIndexReader {
-        let record_option = field_type
-            .get_index_record_option()
-            .unwrap_or(IndexRecordOption::Basic);
-        InvertedIndexReader {
-            termdict:    TermDictionaryImpl::empty(field_type),
-            postings_source: ReadOnlySource::empty(),
-            positions_source: ReadOnlySource::empty(),
-            record_option,
-            total_num_tokens: 0u64
+            delete_bitset,
+            schema,
        }
    }

    /// Returns the term info associated with the term.
    pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
-        self.termdict.get(term.value_bytes())
+        self.termdict.get(term.as_slice())
    }

    /// Return the term dictionary datastructure.
@@ -103,19 +86,15 @@ impl InvertedIndexReader {
    pub fn read_block_postings_from_terminfo(
        &self,
        term_info: &TermInfo,
-        requested_option: IndexRecordOption,
+        option: IndexRecordOption,
    ) -> BlockSegmentPostings {
        let offset = term_info.postings_offset as usize;
        let postings_data = self.postings_source.slice_from(offset);
-        let freq_reading_option = match (self.record_option, requested_option) {
-            (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
-            (_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
-            (_, _) => FreqReadingOption::ReadFreq,
-        };
+        let has_freq = option.has_freq();
        BlockSegmentPostings::from_data(
            term_info.doc_freq as usize,
            SourceRead::from(postings_data),
-            freq_reading_option,
+            has_freq,
        )
    }

@@ -129,6 +108,7 @@ impl InvertedIndexReader {
        option: IndexRecordOption,
    ) -> SegmentPostings {
        let block_postings = self.read_block_postings_from_terminfo(term_info, option);
+        let delete_bitset = self.delete_bitset.clone();
        let position_stream = {
            if option.has_positions() {
                let position_offset = term_info.positions_offset;
@@ -140,17 +120,9 @@ impl InvertedIndexReader {
                None
            }
        };
-        SegmentPostings::from_block_postings(block_postings, position_stream)
+        SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
    }

-    /// Returns the total number of tokens recorded for all documents
-    /// (including deleted documents).
-    pub fn total_num_tokens(&self) -> u64 {
-        self.total_num_tokens
-    }
-
-
-
    /// Returns the segment postings associated with the term, and with the given option,
    /// or `None` if the term has never been encountered and indexed.
    ///
@@ -162,23 +134,19 @@ impl InvertedIndexReader {
    /// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
    /// with `DocId`s and frequencies.
    pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
+        let field = term.field();
+        let field_entry = self.schema.get_field_entry(field);
        let term_info = get!(self.get_term_info(term));
-        Some(self.read_postings_from_terminfo(&term_info, option))
+        let maximum_option = get!(field_entry.field_type().get_index_record_option());
+        let best_effort_option = cmp::min(maximum_option, option);
+        Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
    }

-    pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
-        let term_info = get!(self.get_term_info(term));
-        Some(self.read_postings_from_terminfo(&term_info, option))
-    }
-
-
    /// Returns the number of documents containing the term.
    pub fn doc_freq(&self, term: &Term) -> u32 {
-        self.get_term_info(term)
-            .map(|term_info| term_info.doc_freq)
-            .unwrap_or(0u32)
+        match self.get_term_info(term) {
+            Some(term_info) => term_info.doc_freq,
+            None => 0,
+        }
    }
 }
-
-
-
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -2,7 +2,9 @@ use Result;
 use core::SegmentReader;
 use schema::Document;
 use collector::Collector;
+use common::TimerTree;
 use query::Query;
+use DocId;
 use DocAddress;
 use schema::{Field, Term};
 use termdict::{TermDictionary, TermMerger};
@@ -31,20 +33,20 @@ impl Searcher {
    }

    /// Returns the overall number of documents in the index.
-    pub fn num_docs(&self) -> u64 {
+    pub fn num_docs(&self) -> DocId {
        self.segment_readers
            .iter()
-            .map(|segment_reader| segment_reader.num_docs() as u64)
-            .sum::<u64>()
+            .map(|segment_reader| segment_reader.num_docs())
+            .fold(0u32, |acc, val| acc + val)
    }

    /// Return the overall number of documents containing
    /// the given term.
-    pub fn doc_freq(&self, term: &Term) -> u64 {
+    pub fn doc_freq(&self, term: &Term) -> u32 {
        self.segment_readers
            .iter()
-            .map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term) as u64)
-            .sum::<u64>()
+            .map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term))
+            .fold(0u32, |acc, val| acc + val)
    }

    /// Return the list of segment readers
@@ -58,7 +60,7 @@ impl Searcher {
    }

    /// Runs a query on the segment readers wrapped by the searcher
-    pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
+    pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<TimerTree> {
        query.search(self, collector)
    }

--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -8,6 +8,7 @@ use core::SegmentMeta;
 use fastfield::{self, FastFieldNotAvailableError};
 use fastfield::DeleteBitSet;
 use store::StoreReader;
+use directory::ReadOnlySource;
 use schema::Document;
 use DocId;
 use std::sync::Arc;
@@ -16,16 +17,8 @@ use common::CompositeFile;
 use std::fmt;
 use core::InvertedIndexReader;
 use schema::Field;
-use schema::FieldType;
-use error::ErrorKind;
-use termdict::TermDictionaryImpl;
-use fastfield::FacetReader;
-use fastfield::FastFieldReader;
+use fastfield::{FastFieldReader, U64FastFieldReader};
 use schema::Schema;
-use termdict::TermDictionary;
-use fastfield::{FastValue, MultiValueIntFastFieldReader};
-use schema::Cardinality;
-use fieldnorm::FieldNormReader;

 /// Entry point to access all of the datastructures of the `Segment`
 ///
@@ -38,8 +31,6 @@ use fieldnorm::FieldNormReader;
 /// The segment reader has a very low memory footprint,
 /// as close to all of the memory data is mmapped.
 ///
-///
-/// TODO fix not decoding docfreq
 #[derive(Clone)]
 pub struct SegmentReader {
    inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
@@ -54,7 +45,7 @@ pub struct SegmentReader {
    fieldnorms_composite: CompositeFile,

    store_reader: StoreReader,
-    delete_bitset_opt: Option<DeleteBitSet>,
+    delete_bitset: DeleteBitSet,
    schema: Schema,
 }

@@ -79,14 +70,7 @@ impl SegmentReader {
    /// Return the number of documents that have been
    /// deleted in the segment.
    pub fn num_deleted_docs(&self) -> DocId {
-        self.delete_bitset()
-            .map(|delete_set| delete_set.len() as DocId)
-            .unwrap_or(0u32)
-    }
-
-    /// Returns true iff some of the documents of the segment have been deleted.
-    pub fn has_deletes(&self) -> bool {
-        self.delete_bitset().is_some()
+        self.delete_bitset.len() as DocId
    }

    /// Accessor to a segment's fast field reader given a field.
@@ -99,69 +83,21 @@ impl SegmentReader {
    ///
    /// # Panics
    /// May panic if the index is corrupted.
-    pub fn fast_field_reader<Item: FastValue>(
+    pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
        &self,
        field: Field,
-    ) -> fastfield::Result<FastFieldReader<Item>> {
+    ) -> fastfield::Result<TFastFieldReader> {
        let field_entry = self.schema.get_field_entry(field);
-        if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
-            {
-                self.fast_fields_composite
-                    .open_read(field)
-                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                    .map(FastFieldReader::open)
-            } else {
+        if !TFastFieldReader::is_enabled(field_entry.field_type()) {
            Err(FastFieldNotAvailableError::new(field_entry))
+        } else {
+            self.fast_fields_composite
+                .open_read(field)
+                .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
+                .map(TFastFieldReader::open)
        }
    }

-    /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
-    /// May panick if the field is not a multivalued fastfield of the type `Item`.
-    pub fn multi_fast_field_reader<Item: FastValue>(
-        &self,
-        field: Field,
-    ) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
-        let field_entry = self.schema.get_field_entry(field);
-        if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
-            {
-                let idx_reader = self.fast_fields_composite
-                    .open_read_with_idx(field, 0)
-                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                    .map(FastFieldReader::open)?;
-                let vals_reader = self.fast_fields_composite
-                    .open_read_with_idx(field, 1)
-                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                    .map(FastFieldReader::open)?;
-                Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
-            } else {
-            Err(FastFieldNotAvailableError::new(field_entry))
-        }
-    }
-
-    /// Accessor to the `FacetReader` associated to a given `Field`.
-    pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
-        let field_entry = self.schema.get_field_entry(field);
-        if field_entry.field_type() != &FieldType::HierarchicalFacet {
-            return Err(ErrorKind::InvalidArgument(format!(
-                "The field {:?} is not a \
-                 hierarchical facet.",
-                field_entry
-            )).into());
-        }
-        let term_ords_reader = self.multi_fast_field_reader(field)?;
-        let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
-            ErrorKind::InvalidArgument(format!(
-                "The field \"{}\" is a hierarchical \
-                 but this segment does not seem to have the field term \
-                 dictionary.",
-                field_entry.name()
-            ))
-        })?;
-        let termdict = TermDictionaryImpl::from_source(termdict_source);
-        let facet_reader = FacetReader::new(term_ords_reader, termdict);
-        Ok(facet_reader)
-    }
-
    /// Accessor to the segment's `Field norms`'s reader.
    ///
    /// Field norms are the length (in tokens) of the fields.
@@ -170,15 +106,10 @@ impl SegmentReader {
    ///
    /// They are simply stored as a fast field, serialized in
    /// the `.fieldnorm` file of the segment.
-    pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
-        if let Some(fieldnorm_source) = self.fieldnorms_composite
-            .open_read(field) {
-            FieldNormReader::open(fieldnorm_source)
-        } else {
-            let field_name = self.schema.get_field_name(field);
-            let err_msg=  format!("Field norm not found for field {:?}. Was it market as indexed during indexing.", field_name);
-            panic!(err_msg);
-        }
+    pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
+        self.fieldnorms_composite
+            .open_read(field)
+            .map(U64FastFieldReader::open)
    }

    /// Accessor to the segment's `StoreReader`.
@@ -211,13 +142,12 @@ impl SegmentReader {
        let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
        let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;

-        let delete_bitset_opt =
-            if segment.meta().has_deletes() {
-                let delete_data = segment.open_read(SegmentComponent::DELETE)?;
-                Some(DeleteBitSet::open(delete_data))
-            } else {
-                None
-            };
+        let delete_bitset = if segment.meta().has_deletes() {
+            let delete_data = segment.open_read(SegmentComponent::DELETE)?;
+            DeleteBitSet::open(delete_data)
+        } else {
+            DeleteBitSet::empty()
+        };

        let schema = segment.schema();
        Ok(SegmentReader {
@@ -229,15 +159,13 @@ impl SegmentReader {
            fieldnorms_composite,
            segment_id: segment.id(),
            store_reader,
-            delete_bitset_opt,
+            delete_bitset,
            positions_composite,
            schema,
        })
    }

    /// Returns a field reader associated to the field given in argument.
-    /// If the field was not present in the index during indexing time,
-    /// the InvertedIndexReader is empty.
    ///
    /// The field reader is in charge of iterating through the
    /// term dictionary associated to a specific field,
@@ -248,43 +176,27 @@ impl SegmentReader {
            .expect("Lock poisoned. This should never happen")
            .get(&field)
        {
-            return Arc::clone(inv_idx_reader);
-        }
-        let field_entry = self.schema.get_field_entry(field);
-        let field_type = field_entry.field_type();
-        let record_option_opt = field_type.get_index_record_option();
-
-        if record_option_opt.is_none() {
-            panic!("Field {:?} does not seem indexed.", field_entry.name());
+            return Arc::clone(inv_idx_reader); // cache hit: hand back the already-opened reader
        }

-        let record_option = record_option_opt.unwrap();
-
-        let postings_source_opt = self.postings_composite.open_read(field);
-
-        if postings_source_opt.is_none() {
-            // no documents in the segment contained this field.
-            // As a result, no data is associated to the inverted index.
-            //
-            // Returns an empty inverted index.
-            return Arc::new(InvertedIndexReader::empty(field_type.clone()));
-        }
-
-        let postings_source = postings_source_opt.unwrap();
-
-        let termdict_source = self.termdict_composite
+        let termdict_source: ReadOnlySource = self.termdict_composite
            .open_read(field)
-            .expect("Failed to open field term dictionary in composite file. Is the field indexed");
+            .expect("Index corrupted. Failed to open field term dictionary in composite file.");
+
+        let postings_source = self.postings_composite
+            .open_read(field)
+            .expect("Index corrupted. Failed to open field postings in composite file.");

        let positions_source = self.positions_composite
            .open_read(field)
            .expect("Index corrupted. Failed to open field positions in composite file.");

        let inv_idx_reader = Arc::new(InvertedIndexReader::new(
-            TermDictionaryImpl::from_source(termdict_source),
+            termdict_source,
            postings_source,
            positions_source,
-            record_option,
+            self.delete_bitset.clone(),
+            self.schema.clone(),
        ));

        // by releasing the lock in between, we may end up opening the inverting index
@@ -312,16 +224,14 @@ impl SegmentReader {

    /// Returns the bitset representing
    /// the documents that have been deleted.
-    pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
-        self.delete_bitset_opt.as_ref()
+    pub fn delete_bitset(&self) -> &DeleteBitSet {
+        &self.delete_bitset
    }

    /// Returns true iff the `doc` is marked
    /// as deleted.
    pub fn is_deleted(&self, doc: DocId) -> bool {
-        self.delete_bitset()
-            .map(|delete_set| delete_set.is_deleted(doc))
-            .unwrap_or(false)
+        self.delete_bitset.is_deleted(doc)
    }
 }

--- a/src/datastruct/skip/mod.rs
+++ b/src/datastruct/skip/mod.rs
@@ -9,12 +9,12 @@ pub use self::skiplist::SkipList;
 #[cfg(test)]
 mod tests {

-    use super::{SkipList, SkipListBuilder};
+    use super::*;

    #[test]
    fn test_skiplist() {
        let mut output: Vec<u8> = Vec::new();
-        let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
+        let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
        skip_list_builder.insert(2, &3).unwrap();
        skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
        let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
@@ -24,7 +24,7 @@ mod tests {
    #[test]
    fn test_skiplist2() {
        let mut output: Vec<u8> = Vec::new();
-        let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
+        let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
        skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
        let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
        assert_eq!(skip_list.next(), None);
@@ -71,7 +71,7 @@ mod tests {
    #[test]
    fn test_skiplist5() {
        let mut output: Vec<u8> = Vec::new();
-        let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
+        let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
        skip_list_builder.insert(2, &()).unwrap();
        skip_list_builder.insert(3, &()).unwrap();
        skip_list_builder.insert(5, &()).unwrap();
@@ -103,7 +103,7 @@ mod tests {
    #[test]
    fn test_skiplist7() {
        let mut output: Vec<u8> = Vec::new();
-        let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
+        let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
        for i in 0..1000 {
            skip_list_builder.insert(i, &()).unwrap();
        }
@@ -121,48 +121,35 @@ mod tests {
    #[test]
    fn test_skiplist8() {
        let mut output: Vec<u8> = Vec::new();
-        let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(8);
+        let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
        skip_list_builder.insert(2, &3).unwrap();
        skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
-        assert_eq!(output.len(), 11);
+        assert_eq!(output.len(), 13);
        assert_eq!(output[0], 1u8 + 128u8);
    }

    #[test]
    fn test_skiplist9() {
        let mut output: Vec<u8> = Vec::new();
-        let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(4);
-        for i in 0..4 * 4 * 4 {
+        let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(3);
+        for i in 0..9 {
            skip_list_builder.insert(i, &i).unwrap();
        }
        skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
-        assert_eq!(output.len(), 774);
-        assert_eq!(output[0], 4u8 + 128u8);
+        assert_eq!(output.len(), 117);
+        assert_eq!(output[0], 3u8 + 128u8);
    }

    #[test]
    fn test_skiplist10() {
        // checking that void gets serialized to nothing.
        let mut output: Vec<u8> = Vec::new();
-        let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
-        for i in 0..((4 * 4 * 4) - 1) {
+        let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
+        for i in 0..9 {
            skip_list_builder.insert(i, &()).unwrap();
        }
        skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
-        assert_eq!(output.len(), 230);
-        assert_eq!(output[0], 128u8 + 3u8);
-    }
-
-    #[test]
-    fn test_skiplist11() {
-        // checking that void gets serialized to nothing.
-        let mut output: Vec<u8> = Vec::new();
-        let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
-        for i in 0..(4 * 4) {
-            skip_list_builder.insert(i, &()).unwrap();
-        }
-        skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
-        assert_eq!(output.len(), 65);
+        assert_eq!(output.len(), 81);
        assert_eq!(output[0], 128u8 + 3u8);
    }

--- a/src/datastruct/skip/skiplist.rs
+++ b/src/datastruct/skip/skiplist.rs
@@ -1,5 +1,6 @@
-use common::{BinarySerializable, VInt};
+use common::BinarySerializable;
 use std::marker::PhantomData;
+use DocId;
 use std::cmp::max;

 static EMPTY: [u8; 0] = [];
@@ -7,20 +8,21 @@ static EMPTY: [u8; 0] = [];
 struct Layer<'a, T> {
    data: &'a [u8],
    cursor: &'a [u8],
-    next_id: Option<u64>,
+    next_id: DocId,
    _phantom_: PhantomData<T>,
 }

 impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
-    type Item = (u64, T);
+    type Item = (DocId, T);

-    fn next(&mut self) -> Option<(u64, T)> {
-        if let Some(cur_id) = self.next_id {
-            let cur_val = T::deserialize(&mut self.cursor).unwrap();
-            self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
-            Some((cur_id, cur_val))
-        } else {
+    fn next(&mut self) -> Option<(DocId, T)> {
+        if self.next_id == u32::max_value() {
            None
+        } else {
+            let cur_val = T::deserialize(&mut self.cursor).unwrap();
+            let cur_id = self.next_id;
+            self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
+            Some((cur_id, cur_val))
        }
    }
 }
@@ -28,7 +30,7 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
 impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
    fn from(data: &'a [u8]) -> Layer<'a, T> {
        let mut cursor = data;
-        let next_id = VInt::deserialize_u64(&mut cursor).ok();
+        let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value());
        Layer {
            data,
            cursor,
@@ -43,14 +45,14 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
        Layer {
            data: &EMPTY,
            cursor: &EMPTY,
-            next_id: None,
+            next_id: DocId::max_value(),
            _phantom_: PhantomData,
        }
    }

    fn seek_offset(&mut self, offset: usize) {
        self.cursor = &self.data[offset..];
-        self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
+        self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
    }

    // Returns the last element (key, val)
@@ -58,61 +60,54 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
    //
    // If there is no such element anymore,
    // returns None.
-    //
-    // If the element exists, it will be returned
-    // at the next call to `.next()`.
-    fn seek(&mut self, key: u64) -> Option<(u64, T)> {
-        let mut result: Option<(u64, T)> = None;
-        loop {
-            if let Some(next_id) = self.next_id {
-                if next_id < key {
-                    if let Some(v) = self.next() {
-                        result = Some(v);
-                        continue;
-                    }
+    fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
+        let mut val = None;
+        while self.next_id < doc_id {
+            match self.next() {
+                None => {
+                    break;
+                }
+                v => {
+                    val = v;
                }
            }
-            return result;
        }
+        val
    }
 }

 pub struct SkipList<'a, T: BinarySerializable> {
    data_layer: Layer<'a, T>,
-    skip_layers: Vec<Layer<'a, u64>>,
+    skip_layers: Vec<Layer<'a, u32>>,
 }

 impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
-    type Item = (u64, T);
+    type Item = (DocId, T);

-    fn next(&mut self) -> Option<(u64, T)> {
+    fn next(&mut self) -> Option<(DocId, T)> {
        self.data_layer.next()
    }
 }

 impl<'a, T: BinarySerializable> SkipList<'a, T> {
-    pub fn seek(&mut self, key: u64) -> Option<(u64, T)> {
-        let mut next_layer_skip: Option<(u64, u64)> = None;
+    pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
+        let mut next_layer_skip: Option<(DocId, u32)> = None;
        for skip_layer in &mut self.skip_layers {
            if let Some((_, offset)) = next_layer_skip {
                skip_layer.seek_offset(offset as usize);
            }
-            next_layer_skip = skip_layer.seek(key);
+            next_layer_skip = skip_layer.seek(doc_id);
        }
        if let Some((_, offset)) = next_layer_skip {
            self.data_layer.seek_offset(offset as usize);
        }
-        self.data_layer.seek(key)
+        self.data_layer.seek(doc_id)
    }
 }

 impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> {
    fn from(mut data: &'a [u8]) -> SkipList<'a, T> {
-        let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
-            .unwrap()
-            .into_iter()
-            .map(|el| el.0)
-            .collect();
+        let offsets: Vec<u32> = Vec::deserialize(&mut data).unwrap();
        let num_layers = offsets.len();
        let layers_data: &[u8] = data;
        let data_layer: Layer<'a, T> = if num_layers == 0 {
--- a/src/datastruct/skip/skiplist_builder.rs
+++ b/src/datastruct/skip/skiplist_builder.rs
@@ -1,11 +1,13 @@
 use std::io::Write;
-use common::{BinarySerializable, VInt, is_power_of_2};
+use common::BinarySerializable;
 use std::marker::PhantomData;
+use DocId;
 use std::io;

 struct LayerBuilder<T: BinarySerializable> {
-    period_mask: usize,
+    period: usize,
    buffer: Vec<u8>,
+    remaining: usize,
    len: usize,
    _phantom_: PhantomData<T>,
 }
@@ -21,33 +23,34 @@ impl<T: BinarySerializable> LayerBuilder<T> {
    }

    fn with_period(period: usize) -> LayerBuilder<T> {
-        assert!(is_power_of_2(period), "The period has to be a power of 2.");
        LayerBuilder {
-            period_mask: (period - 1),
+            period,
            buffer: Vec::new(),
+            remaining: period,
            len: 0,
            _phantom_: PhantomData,
        }
    }

-    fn insert(&mut self, key: u64, value: &T) -> io::Result<Option<(u64, u64)>> {
+    fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result<Option<(DocId, u32)>> {
+        self.remaining -= 1;
        self.len += 1;
-        let offset = self.written_size() as u64;
-        VInt(key).serialize(&mut self.buffer)?;
+        let offset = self.written_size() as u32;
+        doc_id.serialize(&mut self.buffer)?;
        value.serialize(&mut self.buffer)?;
-        let emit_skip_info = (self.period_mask & self.len) == 0;
-        if emit_skip_info {
-            Ok(Some((key, offset)))
+        Ok(if self.remaining == 0 {
+            self.remaining = self.period;
+            Some((doc_id, offset))
        } else {
-            Ok(None)
-        }
+            None
+        })
    }
 }

 pub struct SkipListBuilder<T: BinarySerializable> {
    period: usize,
    data_layer: LayerBuilder<T>,
-    skip_layers: Vec<LayerBuilder<u64>>,
+    skip_layers: Vec<LayerBuilder<u32>>,
 }

 impl<T: BinarySerializable> SkipListBuilder<T> {
@@ -59,7 +62,7 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
        }
    }

-    fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u64> {
+    fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u32> {
        if layer_id == self.skip_layers.len() {
            let layer_builder = LayerBuilder::with_period(self.period);
            self.skip_layers.push(layer_builder);
@@ -67,9 +70,9 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
        &mut self.skip_layers[layer_id]
    }

-    pub fn insert(&mut self, key: u64, dest: &T) -> io::Result<()> {
+    pub fn insert(&mut self, doc_id: DocId, dest: &T) -> io::Result<()> {
        let mut layer_id = 0;
-        let mut skip_pointer = self.data_layer.insert(key, dest)?;
+        let mut skip_pointer = self.data_layer.insert(doc_id, dest)?;
        loop {
            skip_pointer = match skip_pointer {
                Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
@@ -83,11 +86,13 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
    }

    pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {
-        let mut size: u64 = self.data_layer.buffer.len() as u64;
-        let mut layer_sizes = vec![VInt(size)];
+        let mut size: u32 = 0;
+        let mut layer_sizes: Vec<u32> = Vec::new();
+        size += self.data_layer.buffer.len() as u32;
+        layer_sizes.push(size);
        for layer in self.skip_layers.iter().rev() {
-            size += layer.buffer.len() as u64;
-            layer_sizes.push(VInt(size));
+            size += layer.buffer.len() as u32;
+            layer_sizes.push(size);
        }
        layer_sizes.serialize(output)?;
        self.data_layer.write(output)?;
--- a/src/datastruct/stacker/hashmap.rs
+++ b/src/datastruct/stacker/hashmap.rs
@@ -1,6 +1,5 @@
 use std::iter;
 use std::mem;
-use postings::UnorderedTermId;
 use super::heap::{BytesRef, Heap, HeapAllocable};

 mod murmurhash2 {
@@ -59,8 +58,10 @@ mod murmurhash2 {
 ///
 /// Returns (the heap size in bytes, the hash table size in number of bits)
 pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
-    let table_size_limit: usize = per_thread_memory_budget / 3;
-    let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
+    let table_size_limit: usize = per_thread_memory_budget / 5;
+    let compute_table_size = |num_bits: usize| {
+        (1 << num_bits) * mem::size_of::<KeyValue>()
+    };
    let table_num_bits: usize = (1..)
        .into_iter()
        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
@@ -102,7 +103,7 @@ impl KeyValue {
 /// the computation of the hash of the key twice,
 /// or copying the key as long as there is no insert.
 ///
-pub struct TermHashMap<'a> {
+pub struct HashMap<'a> {
    table: Box<[KeyValue]>,
    heap: &'a Heap,
    mask: usize,
@@ -117,11 +118,7 @@ struct QuadraticProbing {

 impl QuadraticProbing {
    fn compute(hash: usize, mask: usize) -> QuadraticProbing {
-        QuadraticProbing {
-            hash,
-            i: 0,
-            mask,
-        }
+        QuadraticProbing { hash, i: 0, mask }
    }

    #[inline]
@@ -131,11 +128,11 @@ impl QuadraticProbing {
    }
 }

-impl<'a> TermHashMap<'a> {
-    pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
+impl<'a> HashMap<'a> {
+    pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a> {
        let table_size = 1 << num_bucket_power_of_2;
        let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
-        TermHashMap {
+        HashMap {
            table: table.into_boxed_slice(),
            heap,
            mask: table_size - 1,
@@ -158,25 +155,22 @@ impl<'a> TermHashMap<'a> {
        (key_bytes, expull_addr)
    }

-    pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
+    pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) {
        self.occupied.push(bucket);
        self.table[bucket] = KeyValue {
-            key_value_addr, hash
+            key_value_addr: key_bytes_ref,
+            hash,
        };
    }

-    pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
+    pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
        self.occupied.iter().cloned().map(move |bucket: usize| {
            let kv = self.table[bucket];
-            let (key, offset) = self.get_key_value(kv.key_value_addr);
-            (key, offset, bucket as UnorderedTermId)
+            self.get_key_value(kv.key_value_addr)
        })
    }

-    pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
-        &mut self,
-        key: S,
-    ) -> (UnorderedTermId, &mut V) {
+    pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
        let key_bytes: &[u8] = key.as_ref();
        let hash = murmurhash2::murmurhash2(key.as_ref());
        let mut probe = self.probe(hash);
@@ -188,14 +182,11 @@ impl<'a> TermHashMap<'a> {
                let (addr, val): (u32, &mut V) = self.heap.allocate_object();
                assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
                self.set_bucket(hash, key_bytes_ref, bucket);
-                return (bucket as UnorderedTermId, val);
+                return val;
            } else if kv.hash == hash {
                let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
                if stored_key == key_bytes {
-                    return (
-                        bucket as UnorderedTermId,
-                        self.heap.get_mut_ref(expull_addr),
-                    );
+                    return self.heap.get_mut_ref(expull_addr);
                }
            }
        }
@@ -228,41 +219,41 @@ mod tests {

    #[test]
    fn test_hashmap_size() {
-        assert_eq!(split_memory(100_000), (67232, 12));
-        assert_eq!(split_memory(1_000_000), (737856, 15));
-        assert_eq!(split_memory(10_000_000), (7902848, 18));
+        assert_eq!(split_memory(100_000), (67232, 9));
+        assert_eq!(split_memory(1_000_000), (737856, 12));
+        assert_eq!(split_memory(10_000_000), (7902848, 15));
    }

    #[test]
    fn test_hash_map() {
        let heap = Heap::with_capacity(2_000_000);
-        let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
+        let mut hash_map: HashMap = HashMap::new(18, &heap);
        {
-            let v: &mut TestValue = hash_map.get_or_create("abc").1;
+            let v: &mut TestValue = hash_map.get_or_create("abc");
            assert_eq!(v.val, 0u32);
            v.val = 3u32;
        }
        {
-            let v: &mut TestValue = hash_map.get_or_create("abcd").1;
+            let v: &mut TestValue = hash_map.get_or_create("abcd");
            assert_eq!(v.val, 0u32);
            v.val = 4u32;
        }
        {
-            let v: &mut TestValue = hash_map.get_or_create("abc").1;
+            let v: &mut TestValue = hash_map.get_or_create("abc");
            assert_eq!(v.val, 3u32);
        }
        {
-            let v: &mut TestValue = hash_map.get_or_create("abcd").1;
+            let v: &mut TestValue = hash_map.get_or_create("abcd");
            assert_eq!(v.val, 4u32);
        }
        let mut iter_values = hash_map.iter();
        {
-            let (_, addr, _) = iter_values.next().unwrap();
+            let (_, addr) = iter_values.next().unwrap();
            let val: &TestValue = heap.get_ref(addr);
            assert_eq!(val.val, 3u32);
        }
        {
-            let (_, addr, _) = iter_values.next().unwrap();
+            let (_, addr) = iter_values.next().unwrap();
            let val: &TestValue = heap.get_ref(addr);
            assert_eq!(val.val, 4u32);
        }
--- a/src/datastruct/stacker/mod.rs
+++ b/src/datastruct/stacker/mod.rs
@@ -4,7 +4,7 @@ mod expull;

 pub use self::heap::{Heap, HeapAllocable};
 pub use self::expull::ExpUnrolledLinkedList;
-pub use self::hashmap::TermHashMap;
+pub use self::hashmap::HashMap;

 #[test]
 fn test_unrolled_linked_list() {
@@ -16,15 +16,15 @@ fn test_unrolled_linked_list() {
        ks.push(2);
        ks.push(3);
        for k in (1..5).map(|k| k * 100) {
-            let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
+            let mut hashmap: HashMap = HashMap::new(10, &heap);
            for j in 0..k {
                for i in 0..500 {
-                    let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
+                    let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
                    v.push(i * j, &heap);
                }
            }
            let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
-            for (key, addr, _) in hashmap.iter() {
+            for (key, addr) in hashmap.iter() {
                map_addr.insert(Vec::from(key), addr);
            }

--- a/src/directory/managed_directory.rs
+++ b/src/directory/managed_directory.rs
@@ -282,7 +282,6 @@ impl Clone for ManagedDirectory {
 mod tests {

    use super::*;
-    #[cfg(feature="mmap")]
    use directory::MmapDirectory;
    use std::path::Path;
    use std::io::Write;
@@ -294,7 +293,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap")]
    fn test_managed_directory() {
        let tempdir = TempDir::new("index").unwrap();
        let tempdir_path = PathBuf::from(tempdir.path());
@@ -343,7 +341,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap ")]
    fn test_managed_directory_gc_while_mmapped() {
        let tempdir = TempDir::new("index").unwrap();
        let tempdir_path = PathBuf::from(tempdir.path());
@@ -373,7 +370,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap")]
    fn test_managed_directory_protect() {
        let tempdir = TempDir::new("index").unwrap();
        let tempdir_path = PathBuf::from(tempdir.path());
--- a/src/directory/mmap_directory.rs
+++ b/src/directory/mmap_directory.rs
@@ -6,6 +6,7 @@ use directory::ReadOnlySource;
 use directory::shared_vec_slice::SharedVecSlice;
 use directory::WritePtr;
 use fst::raw::MmapReadOnly;
+use memmap::{Mmap, Protection};
 use std::collections::hash_map::Entry as HashMapEntry;
 use std::collections::HashMap;
 use std::convert::From;
@@ -14,17 +15,16 @@ use std::fs::{self, File};
 use std::fs::OpenOptions;
 use std::io::{self, Seek, SeekFrom};
 use std::io::{BufWriter, Read, Write};
+use std::mem;
 use std::path::{Path, PathBuf};
 use std::result;
 use std::sync::Arc;
 use std::sync::RwLock;
+use std::sync::Weak;
 use tempdir::TempDir;

-/// Returns None iff the file exists, can be read, but is empty (and hence
-/// cannot be mmapped).
-///
-fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
-    let file = File::open(full_path).map_err(|e| {
+fn open_mmap(full_path: &Path) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
+    let file = File::open(&full_path).map_err(|e| {
        if e.kind() == io::ErrorKind::NotFound {
            OpenReadError::FileDoesNotExist(full_path.to_owned())
        } else {
@@ -36,13 +36,14 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
        .map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
    if meta_data.len() == 0 {
        // if the file size is 0, it will not be possible
-        // to mmap the file, so we return None
+        // to mmap the file, so we return `Ok(None)`
        // instead.
        return Ok(None);
    }
-    MmapReadOnly::open(&file)
-        .map(Some)
-        .map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
+    match Mmap::open(&file, Protection::Read) {
+        Ok(mmap) => Ok(Some(Arc::new(mmap))),
+        Err(e) => Err(From::from(IOError::with_path(full_path.to_owned(), e))),
+    }
 }

 #[derive(Default, Clone, Debug, Serialize, Deserialize)]
@@ -51,7 +52,10 @@ pub struct CacheCounters {
    pub hit: usize,
    // Number of time tantivy had to call `mmap`
    // as no entry was in the cache.
-    pub miss: usize,
+    pub miss_empty: usize,
+    // Number of times tantivy had to call `mmap`
+    // as the cached entry's weak reference had expired.
+    pub miss_weak: usize,
 }

 #[derive(Clone, Debug, Serialize, Deserialize)]
@@ -62,25 +66,38 @@ pub struct CacheInfo {

 struct MmapCache {
    counters: CacheCounters,
-    cache: HashMap<PathBuf, MmapReadOnly>,
+    cache: HashMap<PathBuf, Weak<Mmap>>,
+    purge_weak_limit: usize,
 }

+const STARTING_PURGE_WEAK_LIMIT: usize = 1_000;
+
 impl Default for MmapCache {
    fn default() -> MmapCache {
        MmapCache {
            counters: CacheCounters::default(),
            cache: HashMap::new(),
+            purge_weak_limit: STARTING_PURGE_WEAK_LIMIT,
        }
    }
 }

 impl MmapCache {
-    /// Removes a `MmapReadOnly` entry from the mmap cache.
-    fn discard_from_cache(&mut self, full_path: &Path) -> bool {
-        self.cache.remove(full_path).is_some()
+    fn cleanup(&mut self) {
+        let previous_cache_size = self.cache.len();
+        let mut new_cache = HashMap::new();
+        mem::swap(&mut new_cache, &mut self.cache);
+        self.cache = new_cache
+            .into_iter()
+            .filter(|&(_, ref weak_ref)| weak_ref.upgrade().is_some())
+            .collect();
+        if self.cache.len() == previous_cache_size {
+            self.purge_weak_limit *= 2;
+        }
    }

    fn get_info(&mut self) -> CacheInfo {
+        self.cleanup();
        let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
        CacheInfo {
            counters: self.counters.clone(),
@@ -88,18 +105,33 @@ impl MmapCache {
        }
    }

-    fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
-        Ok(match self.cache.entry(full_path.to_owned()) {
-            HashMapEntry::Occupied(occupied_entry) => {
-                let mmap = occupied_entry.get();
-                self.counters.hit += 1;
-                Some(mmap.clone())
+    fn get_mmap(&mut self, full_path: &PathBuf) -> Result<Option<Arc<Mmap>>, OpenReadError> {
+        // If the cache grows beyond this limit, go through the weak
+        // refs and remove those that can no longer be upgraded.
+        if self.cache.len() > self.purge_weak_limit {
+            self.cleanup();
+        }
+        Ok(match self.cache.entry(full_path.clone()) {
+            HashMapEntry::Occupied(mut occupied_entry) => {
+                if let Some(mmap_arc) = occupied_entry.get().upgrade() {
+                    self.counters.hit += 1;
+                    Some(Arc::clone(&mmap_arc))
+                } else {
+                    // The entry exists but the weak ref has been destroyed.
+                    self.counters.miss_weak += 1;
+                    if let Some(mmap_arc) = open_mmap(full_path)? {
+                        occupied_entry.insert(Arc::downgrade(&mmap_arc));
+                        Some(mmap_arc)
+                    } else {
+                        None
+                    }
+                }
            }
            HashMapEntry::Vacant(vacant_entry) => {
-                self.counters.miss += 1;
-                if let Some(mmap) = open_mmap(full_path)? {
-                    vacant_entry.insert(mmap.clone());
-                    Some(mmap)
+                self.counters.miss_empty += 1;
+                if let Some(mmap_arc) = open_mmap(full_path)? {
+                    vacant_entry.insert(Arc::downgrade(&mmap_arc));
+                    Some(mmap_arc)
                } else {
                    None
                }
@@ -196,7 +228,6 @@ impl MmapDirectory {
        fd.sync_all()?;
        Ok(())
    }
-
    /// Returns some statistical information
    /// about the Mmap cache.
    ///
@@ -253,6 +284,7 @@ impl Directory for MmapDirectory {

        Ok(mmap_cache
            .get_mmap(&full_path)?
+            .map(MmapReadOnly::from)
            .map(ReadOnlySource::Mmap)
            .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
    }
@@ -287,8 +319,6 @@ impl Directory for MmapDirectory {
        Ok(BufWriter::new(Box::new(writer)))
    }

-    /// Any entry associated to the path in the mmap will be
-    /// removed before the file is deleted.
    fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
        debug!("Deleting file {:?}", path);
        let full_path = self.resolve_path(path);
@@ -300,8 +330,6 @@ impl Directory for MmapDirectory {
            );
            IOError::with_path(path.to_owned(), make_io_err(msg))
        })?;
-        mmap_cache.discard_from_cache(path);
-
        // Removing the entry in the MMap cache.
        // The munmap will appear on Drop,
        // when the last reference is gone.
@@ -387,8 +415,7 @@ mod tests {
        // here we test if the cache releases
        // mmaps correctly.
        let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
-        let num_paths = 10;
-        let paths: Vec<PathBuf> = (0..num_paths)
+        let paths: Vec<PathBuf> = (0..10)
            .map(|i| PathBuf::from(&*format!("file_{}", i)))
            .collect();
        {
@@ -399,24 +426,49 @@ mod tests {
            }
        }
        {
-            for (i, path) in paths.iter().enumerate() {
-                let _r = mmap_directory.open_read(path).unwrap();
-                assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
-            }
-            for path in paths.iter() {
-                let _r = mmap_directory.open_read(path).unwrap();
-                assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
-            }
-            for (i, path) in paths.iter().enumerate() {
-                mmap_directory.delete(path).unwrap();
-                assert_eq!(
-                    mmap_directory.get_cache_info().mmapped.len(),
-                    num_paths - i - 1
-                );
+            for path in &paths {
+                {
+                    let _r = mmap_directory.open_read(path).unwrap();
+                    assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
+                }
+                assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
            }
        }
-        assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
-        assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
+        assert_eq!(mmap_directory.get_cache_info().counters.miss_empty, 10);
+
+        {
+            // test weak miss
+            // the first pass creates the weak refs.
+            for path in &paths {
+                let _r = mmap_directory.open_read(path).unwrap();
+            }
+            // ... the second finds those weak refs expired.
+            for path in &paths {
+                let _r = mmap_directory.open_read(path).unwrap();
+            }
+            let cache_info = mmap_directory.get_cache_info();
+            assert_eq!(cache_info.counters.miss_empty, 20);
+            assert_eq!(cache_info.counters.miss_weak, 10);
+        }
+
+        {
+            let mut saved_readmmaps = vec![];
+            // Keeps reference alive
+            for (i, path) in paths.iter().enumerate() {
+                let r = mmap_directory.open_read(path).unwrap();
+                saved_readmmaps.push(r);
+                assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
+            }
+            let cache_info = mmap_directory.get_cache_info();
+            assert_eq!(cache_info.counters.miss_empty, 30);
+            assert_eq!(cache_info.counters.miss_weak, 10);
+            assert_eq!(cache_info.mmapped.len(), 10);
+
+            for saved_readmmap in saved_readmmaps {
+                assert_eq!(saved_readmmap.as_slice(), content);
+            }
+        }
+
        assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
    }

--- a/src/directory/mod.rs
+++ b/src/directory/mod.rs
@@ -3,29 +3,21 @@
 WORM directory abstraction.

 */
-
-#[cfg(feature="mmap")]
 mod mmap_directory;
-
 mod ram_directory;
 mod directory;
 mod read_only_source;
 mod shared_vec_slice;
 mod managed_directory;
-mod static_directory;

 /// Errors specific to the directory module.
 pub mod error;

 use std::io::{BufWriter, Seek, Write};

-pub use self::static_directory::StaticDirectory;
-pub use self::static_directory::write_static_from_directory;
 pub use self::read_only_source::ReadOnlySource;
 pub use self::directory::Directory;
 pub use self::ram_directory::RAMDirectory;
-
-#[cfg(feature="mmap")]
 pub use self::mmap_directory::MmapDirectory;

 pub(crate) use self::read_only_source::SourceRead;
@@ -59,7 +51,6 @@ mod tests {
    }

    #[test]
-    #[cfg(feature="mmap")]
    fn test_mmap_directory() {
        let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
        test_directory(&mut mmap_directory);
@@ -125,6 +116,9 @@ mod tests {
            assert!(directory.open_read(*TEST_PATH).is_err());
            let _w = directory.open_write(*TEST_PATH).unwrap();
            assert!(directory.exists(*TEST_PATH));
+            if let Err(e) = directory.open_read(*TEST_PATH) {
+                println!("{:?}", e);
+            }
            assert!(directory.open_read(*TEST_PATH).is_ok());
            assert!(directory.delete(*TEST_PATH).is_ok());
        }
--- a/src/directory/read_only_source.rs
+++ b/src/directory/read_only_source.rs
@@ -1,13 +1,10 @@
-#[cfg(feature="mmap")]
 use fst::raw::MmapReadOnly;
 use std::ops::Deref;
 use super::shared_vec_slice::SharedVecSlice;
 use common::HasLen;
 use std::slice;
 use std::io::{self, Read};
-use stable_deref_trait::{CloneStableDeref, StableDeref};
-
-const EMPTY_SLICE: [u8; 0] = [];
+use stable_deref_trait::StableDeref;

 /// Read object that represents files in tantivy.
 ///
@@ -17,16 +14,12 @@ const EMPTY_SLICE: [u8; 0] = [];
 /// hold by this object should never be altered or destroyed.
 pub enum ReadOnlySource {
    /// Mmap source of data
-    #[cfg(feature="mmap")]
    Mmap(MmapReadOnly),
    /// Wrapping a `Vec<u8>`
    Anonymous(SharedVecSlice),
-    /// Wrapping a static slice
-    Static(&'static [u8])
 }

 unsafe impl StableDeref for ReadOnlySource {}
-unsafe impl CloneStableDeref for ReadOnlySource {}

 impl Deref for ReadOnlySource {
    type Target = [u8];
@@ -39,16 +32,14 @@ impl Deref for ReadOnlySource {
 impl ReadOnlySource {
    /// Creates an empty ReadOnlySource
    pub fn empty() -> ReadOnlySource {
-        ReadOnlySource::Static(&EMPTY_SLICE)
+        ReadOnlySource::Anonymous(SharedVecSlice::empty())
    }

    /// Returns the data underlying the ReadOnlySource object.
    pub fn as_slice(&self) -> &[u8] {
        match *self {
-            #[cfg(feature="mmap")]
            ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
            ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
-            ReadOnlySource::Static(data) => data,
        }
    }

@@ -71,9 +62,7 @@ impl ReadOnlySource {
    /// 1KB slice is remaining, the whole `500MBs`
    /// are retained in memory.
    pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
-        assert!(from_offset <= to_offset, "Requested negative slice [{}..{}]", from_offset, to_offset);
        match *self {
-            #[cfg(feature="mmap")]
            ReadOnlySource::Mmap(ref mmap_read_only) => {
                let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
                ReadOnlySource::Mmap(sliced_mmap)
@@ -81,9 +70,6 @@ impl ReadOnlySource {
            ReadOnlySource::Anonymous(ref shared_vec) => {
                ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
            }
-            ReadOnlySource::Static(data) => {
-                ReadOnlySource::Static(&data[from_offset..to_offset])
-            }
        }
    }

@@ -124,12 +110,6 @@ impl From<Vec<u8>> for ReadOnlySource {
    }
 }

-impl From<&'static [u8]> for ReadOnlySource {
-    fn from(data: &'static [u8]) -> ReadOnlySource {
-        ReadOnlySource::Static(data)
-    }
-}
-
 /// Acts as a owning cursor over the data backed up by a `ReadOnlySource`
 pub(crate) struct SourceRead {
    _data_owner: ReadOnlySource,
@@ -141,16 +121,6 @@ impl SourceRead {
    pub fn advance(&mut self, len: usize) {
        self.cursor = &self.cursor[len..];
    }
-
-    pub fn slice_from(&self, start: usize) -> &[u8] {
-        &self.cursor[start..]
-
-    }
-
-    pub fn get(&self, idx: usize) -> u8 {
-        self.cursor[idx]
-    }
-
 }

 impl AsRef<[u8]> for SourceRead {
--- a/src/directory/static_directory.rs
+++ b/src/directory/static_directory.rs
@@ -1,123 +0,0 @@
-use std::collections::HashMap;
-use Directory;
-use std::path::PathBuf;
-use directory::ReadOnlySource;
-use std::io::BufWriter;
-use directory::error::{DeleteError, OpenReadError, OpenWriteError};
-use std::path::Path;
-use std::fmt::{Formatter, Debug, self};
-use Result as TantivyResult;
-use directory::SeekableWrite;
-use std::io;
-use std::fs;
-use common::Endianness;
-use common::BinarySerializable;
-use common::VInt;
-use byteorder::ByteOrder;
-use std::str;
-use std::fs::File;
-use std::io::{Read, Write};
-use std::ffi::OsString;
-
-#[derive(Clone)]
-pub struct StaticDirectory {
-    files: HashMap<PathBuf, &'static [u8]>,
-}
-
-impl Debug for StaticDirectory {
-    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
-        write!(f, "StaticDirectory[{} files]", self.files.len())?;
-        Ok(())
-    }
-}
-
-impl StaticDirectory {
-    pub fn open(mut data: &'static [u8]) -> TantivyResult<StaticDirectory> {
-        assert!(data.len() > 8);
-        let footer_len_offset = data.len() - 8;
-        let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
-        let mut body = &data[..body_len];
-        let mut footer = &data[body_len..footer_len_offset];
-        let num_files = VInt::deserialize(&mut footer)?.0 as usize;
-        let mut files = HashMap::new();
-        for _ in 0..num_files {
-            let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
-            let filename = &footer[..filename_len];
-            footer = &footer[filename_len..];
-            let data_len = VInt::deserialize(&mut footer)?.0 as usize;
-            let file_data = &body[..data_len];
-            body = &body[data_len..];
-            let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
-            let filename = PathBuf::from(filename_str);
-            println!("{:?} {:?}", filename, data_len);
-            files.insert(filename, file_data);
-        }
-        Ok(StaticDirectory {
-            files
-        })
-    }
-}
-
-impl Directory for StaticDirectory {
-    fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
-        if let Some(static_data) = self.files.get(path) {
-            Ok(ReadOnlySource::from(*static_data))
-        } else {
-            Err(OpenReadError::FileDoesNotExist(path.to_owned()))
-        }
-    }
-
-    fn delete(&self, path: &Path) -> Result<(), DeleteError> {
-        unimplemented!("Static directory is read-only !")
-    }
-
-    fn exists(&self, path: &Path) -> bool {
-        self.files.contains_key(path)
-    }
-
-    fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
-        unimplemented!("Static directory is read-only !")
-    }
-
-    fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
-        if let Some(static_data) = self.files.get(path) {
-            Ok(static_data.to_vec())
-        } else {
-            Err(OpenReadError::FileDoesNotExist(path.to_owned()))
-        }
-    }
-
-    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
-        unimplemented!("Static directory is read-only !")
-    }
-
-    fn box_clone(&self) -> Box<Directory> {
-        box self.clone()
-    }
-}
-
-pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
-    assert!(directory_path.is_dir());
-    let mut file_data: Vec<(OsString, usize)> = Vec::new();
-    let mut write: Vec<u8> = Vec::new();
-    for entry in fs::read_dir(directory_path)? {
-        let entry = entry?;
-        let path = entry.path();
-        if path.is_file() {
-            info!("Appending {}", path.to_string_lossy());
-            let mut open_file = File::open(&path)?;
-            let file_len = open_file.read_to_end(&mut write)?;
-            file_data.push((entry.file_name(), file_len));
-        }
-    }
-    // write footer
-    let body_len = write.len();
-    VInt(file_data.len() as u64).serialize(&mut write)?;
-    for (filename, filelen) in file_data {
-        VInt(filename.len() as u64).serialize(&mut write)?;
-        write.write_all(filename.to_string_lossy().as_bytes())?;
-        VInt(filelen as u64).serialize(&mut write)?;
-    }
-    (body_len as u64).serialize(&mut write)?;
-    Ok(write)
-}
--- a/src/fastfield/delete.rs
+++ b/src/fastfield/delete.rs
@@ -51,7 +51,21 @@ impl DeleteBitSet {
        }
    }

-    /// Returns whether the document has been marked as deleted.
+    /// Returns an empty delete bit set.
+    pub fn empty() -> DeleteBitSet {
+        DeleteBitSet {
+            data: ReadOnlySource::empty(),
+            len: 0,
+        }
+    }
+
+    /// Returns true iff the segment has some deleted documents.
+    pub fn has_deletes(&self) -> bool {
+        self.len() > 0
+    }
+
+    /// Returns true iff the document is deleted.
+    #[inline]
    pub fn is_deleted(&self, doc: DocId) -> bool {
        if self.len == 0 {
            false
@@ -62,10 +76,8 @@ impl DeleteBitSet {
            b & (1u8 << shift) != 0
        }
    }
-
 }

-
 impl HasLen for DeleteBitSet {
    fn len(&self) -> usize {
        self.len
--- a/src/fastfield/facet_reader.rs
+++ b/src/fastfield/facet_reader.rs
@@ -1,68 +0,0 @@
-use super::MultiValueIntFastFieldReader;
-use DocId;
-use termdict::TermOrdinal;
-use schema::Facet;
-use termdict::{TermDictionary, TermDictionaryImpl};
-
-/// The facet reader makes it possible to access the list of
-/// facets associated to a given document in a specific
-/// segment.
-///
-/// Rather than manipulating `Facet` object directly, the API
-/// exposes those in the form of list of `Facet` ordinal.
-///
-/// A segment ordinal can then be translated into a facet via
-/// `.facet_from_ord(...)`.
-///
-/// Facet ordinals are defined as their position in the sorted
-/// list of facets. This ordinal is segment local and
-/// only makes sense for a given segment.
-pub struct FacetReader {
-    term_ords: MultiValueIntFastFieldReader<u64>,
-    term_dict: TermDictionaryImpl,
-}
-
-impl FacetReader {
-    /// Creates a new `FacetReader`.
-    ///
-    /// A facet reader just wraps :
-    /// - a `MultiValueIntFastFieldReader` that makes it possible to
-    /// access the list of facet ords for a given document.
-    /// - a `TermDictionaryImpl` that helps associating a facet to
-    /// an ordinal and vice versa.
-    pub fn new(
-        term_ords: MultiValueIntFastFieldReader<u64>,
-        term_dict: TermDictionaryImpl,
-    ) -> FacetReader {
-        FacetReader {
-            term_ords,
-            term_dict,
-        }
-    }
-
-    /// Returns the size of the sets of facets in the segment.
-    /// This does not take in account the documents that may be marked
-    /// as deleted.
-    ///
-    /// `Facet` ordinals range from `0` to `num_facets() - 1`.
-    pub fn num_facets(&self) -> usize {
-        self.term_dict.num_terms()
-    }
-
-    /// Accessor for the facet term dictionary.
-    pub fn facet_dict(&self) -> &TermDictionaryImpl {
-        &self.term_dict
-    }
-
-    /// Given a term ordinal returns the term associated to it.
-    pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
-        let found_term = self.term_dict
-            .ord_to_term(facet_ord as u64, output.inner_buffer_mut());
-        assert!(found_term, "Term ordinal {} no found.", facet_ord);
-    }
-
-    /// Return the list of facet ordinals associated to a document.
-    pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
-        self.term_ords.get_vals(doc, output);
-    }
-}
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -23,119 +23,36 @@ values stored.
 Read access performance is comparable to that of an array lookup.
 */

-use common;
-use schema::Cardinality;
-use schema::FieldType;
-use schema::Value;
-pub use self::delete::DeleteBitSet;
-pub use self::delete::write_delete_bitset;
-pub use self::error::{FastFieldNotAvailableError, Result};
-pub use self::facet_reader::FacetReader;
-pub use self::multivalued::MultiValueIntFastFieldReader;
-pub use self::reader::FastFieldReader;
-pub use self::serializer::FastFieldSerializer;
-pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
-
 mod reader;
 mod writer;
 mod serializer;
 mod error;
 mod delete;
-mod facet_reader;
-mod multivalued;

-/// Trait for types that are allowed for fast fields: (u64 or i64).
-pub trait FastValue: Default + Clone + Copy {
-    /// Converts a value from u64
-    ///
-    /// Internally all fast field values are encoded as u64.
-    fn from_u64(val: u64) -> Self;
-
-    /// Converts a value to u64.
-    ///
-    /// Internally all fast field values are encoded as u64.
-    fn to_u64(&self) -> u64;
-
-    /// Returns the fast field cardinality that can be extracted from the given
-    /// `FieldType`.
-    ///
-    /// If the type is not a fast field, `None` is returned.
-    fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality>;
-
-    /// Cast value to `u64`.
-    /// The value is just reinterpreted in memory.
-    fn as_u64(&self) -> u64;
-}
-
-impl FastValue for u64 {
-    fn from_u64(val: u64) -> Self {
-        val
-    }
-
-    fn to_u64(&self) -> u64 {
-        *self
-    }
-
-    fn as_u64(&self) -> u64 {
-        *self
-    }
-
-    fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
-        match *field_type {
-            FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
-            FieldType::HierarchicalFacet => Some(Cardinality::MultiValues),
-            _ => None,
-        }
-    }
-}
-
-impl FastValue for i64 {
-    fn from_u64(val: u64) -> Self {
-        common::u64_to_i64(val)
-    }
-
-    fn to_u64(&self) -> u64 {
-        common::i64_to_u64(*self)
-    }
-
-    fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
-        match *field_type {
-            FieldType::I64(ref integer_options) => integer_options.get_fastfield_cardinality(),
-            _ => None,
-        }
-    }
-
-    fn as_u64(&self) -> u64 {
-        *self as u64
-    }
-}
-
-fn value_to_u64(value: &Value) -> u64 {
-    match *value {
-        Value::U64(ref val) => *val,
-        Value::I64(ref val) => common::i64_to_u64(*val),
-        _ => panic!("Expected a u64/i64 field, got {:?} ", value),
-    }
-}
+pub use self::delete::write_delete_bitset;
+pub use self::delete::DeleteBitSet;
+pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
+pub use self::reader::{I64FastFieldReader, U64FastFieldReader};
+pub use self::reader::FastFieldReader;
+pub use self::serializer::FastFieldSerializer;
+pub use self::error::{FastFieldNotAvailableError, Result};

 #[cfg(test)]
 mod tests {
-
-    use common::CompositeFile;
+    use super::*;
+    use schema::Field;
+    use std::path::Path;
    use directory::{Directory, RAMDirectory, WritePtr};
+    use schema::Document;
+    use schema::{Schema, SchemaBuilder};
+    use schema::FAST;
+    use test::Bencher;
+    use test;
    use fastfield::FastFieldReader;
    use rand::Rng;
    use rand::SeedableRng;
+    use common::CompositeFile;
    use rand::XorShiftRng;
-    use schema::{Schema, SchemaBuilder};
-    use schema::Document;
-    use schema::FAST;
-    use schema::Field;
-    use std::collections::HashMap;
-    use std::path::Path;
-    use super::*;
-    use test;
-    use test::Bencher;

    lazy_static! {
        static ref SCHEMA: Schema = {
@@ -148,9 +65,15 @@ mod tests {
        };
    }

+    fn add_single_field_doc(fast_field_writers: &mut FastFieldsWriter, field: Field, value: u64) {
+        let mut doc = Document::default();
+        doc.add_u64(field, value);
+        fast_field_writers.add_document(&doc);
+    }
+
    #[test]
    pub fn test_fastfield() {
-        let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
+        let test_fastfield = U64FastFieldReader::from(vec![100, 200, 300]);
        assert_eq!(test_fastfield.get(0), 100);
        assert_eq!(test_fastfield.get(1), 200);
        assert_eq!(test_fastfield.get(2), 300);
@@ -164,22 +87,20 @@ mod tests {
            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            fast_field_writers.add_document(&doc!(*FIELD=>13u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>14u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>2u64));
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64);
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
-            assert_eq!(source.len(), 36 as usize);
+            assert_eq!(source.len(), 35 as usize);
        }
        {
            let composite_file = CompositeFile::open(&source).unwrap();
            let field_source = composite_file.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(field_source);
+            let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
            assert_eq!(fast_field_reader.get(0), 13u64);
            assert_eq!(fast_field_reader.get(1), 14u64);
            assert_eq!(fast_field_reader.get(2), 2u64);
@@ -194,28 +115,26 @@ mod tests {
            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            fast_field_writers.add_document(&doc!(*FIELD=>4u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>3_052u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>9_002u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>15_001u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>777u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>1_002u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>1_501u64));
-            fast_field_writers.add_document(&doc!(*FIELD=>215u64));
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 777u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64);
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64);
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
-            assert_eq!(source.len(), 61 as usize);
+            assert_eq!(source.len(), 60 as usize);
        }
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
+            let fast_field_reader: U64FastFieldReader =
+                U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
            assert_eq!(fast_field_reader.get(0), 4u64);
            assert_eq!(fast_field_reader.get(1), 14_082_001u64);
            assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -238,21 +157,19 @@ mod tests {
            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
            for _ in 0..10_000 {
-                fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
+                add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
-            assert_eq!(source.len(), 34 as usize);
+            assert_eq!(source.len(), 33 as usize);
        }
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
+            let fast_field_reader: U64FastFieldReader =
+                U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
            for doc in 0..10_000 {
                assert_eq!(fast_field_reader.get(doc), 100_000u64);
            }
@@ -269,23 +186,26 @@ mod tests {
            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
            // forcing the amplitude to be high
-            fast_field_writers.add_document(&doc!(*FIELD=>0u64));
+            add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
            for i in 0u64..10_000u64 {
-                fast_field_writers.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + i));
+                add_single_field_doc(
+                    &mut fast_field_writers,
+                    *FIELD,
+                    5_000_000_000_000_000_000u64 + i,
+                );
            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
-            assert_eq!(source.len(), 80042 as usize);
+            assert_eq!(source.len(), 80041 as usize);
        }
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
+            let fast_field_reader: U64FastFieldReader =
+                U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
+
            assert_eq!(fast_field_reader.get(0), 0u64);
            for doc in 1..10_001 {
                assert_eq!(
@@ -313,19 +233,17 @@ mod tests {
                doc.add_i64(i64_field, i);
                fast_field_writers.add_document(&doc);
            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
-            assert_eq!(source.len(), 17709 as usize);
+            assert_eq!(source.len(), 17708 as usize);
        }
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(i64_field).unwrap();
-            let fast_field_reader = FastFieldReader::<i64>::open(data);
+            let fast_field_reader: I64FastFieldReader =
+                I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());

            assert_eq!(fast_field_reader.min_value(), -100i64);
            assert_eq!(fast_field_reader.max_value(), 9_999i64);
@@ -354,17 +272,15 @@ mod tests {
            let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
            let doc = Document::default();
            fast_field_writers.add_document(&doc);
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }

        let source = directory.open_read(&path).unwrap();
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(i64_field).unwrap();
-            let fast_field_reader = FastFieldReader::<i64>::open(data);
+            let fast_field_reader: I64FastFieldReader =
+                I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
            assert_eq!(fast_field_reader.get(0u32), 0i64);
        }
    }
@@ -387,19 +303,17 @@ mod tests {
            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+            for x in &permutation {
+                add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
+            let fast_field_reader: U64FastFieldReader =
+                U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());

            let mut a = 0u64;
            for _ in 0..n {
@@ -415,7 +329,7 @@ mod tests {
        b.iter(|| {
            let n = test::black_box(7000u32);
            let mut a = 0u64;
-            for i in Iterator::step_by(0u32..n, 7) {
+            for i in Iterator::step_by((0u32..n), 7) {
                a ^= permutation[i as usize];
            }
            a
@@ -444,24 +358,22 @@ mod tests {
            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+            for x in &permutation {
+                add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
+            let fast_field_reader: U64FastFieldReader =
+                U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());

            b.iter(|| {
                let n = test::black_box(7000u32);
                let mut a = 0u64;
-                for i in Iterator::step_by(0u32..n, 7) {
+                for i in Iterator::step_by((0u32..n), 7) {
                    a ^= fast_field_reader.get(i);
                }
                a
@@ -478,19 +390,17 @@ mod tests {
            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+            for x in &permutation {
+                add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        {
            let fast_fields_composite = CompositeFile::open(&source).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data);
+            let fast_field_reader: U64FastFieldReader =
+                U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());

            b.iter(|| {
                let n = test::black_box(1000u32);
--- a/src/fastfield/multivalued/mod.rs
+++ b/src/fastfield/multivalued/mod.rs
@@ -1,88 +0,0 @@
-mod writer;
-mod reader;
-
-pub use self::writer::MultiValueIntFastFieldWriter;
-pub use self::reader::MultiValueIntFastFieldReader;
-
-#[cfg(test)]
-mod tests {
-
-    use schema::SchemaBuilder;
-    use schema::Cardinality;
-    use schema::IntOptions;
-    use Index;
-
-    #[test]
-    fn test_multivalued_u64() {
-        let mut schema_builder = SchemaBuilder::default();
-        let field = schema_builder.add_u64_field(
-            "multifield",
-            IntOptions::default().set_fast(Cardinality::MultiValues),
-        );
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        index_writer.add_document(doc!(field=>1u64, field=>3u64));
-        index_writer.add_document(doc!());
-        index_writer.add_document(doc!(field=>4u64));
-        index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
-        assert!(index_writer.commit().is_ok());
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let reader = searcher.segment_reader(0);
-        let mut vals = Vec::new();
-        let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
-        {
-            multi_value_reader.get_vals(2, &mut vals);
-            assert_eq!(&vals, &[4u64]);
-        }
-        {
-            multi_value_reader.get_vals(0, &mut vals);
-            assert_eq!(&vals, &[1u64, 3u64]);
-        }
-        {
-            multi_value_reader.get_vals(1, &mut vals);
-            assert!(vals.is_empty());
-        }
-    }
-
-    #[test]
-    fn test_multivalued_i64() {
-        let mut schema_builder = SchemaBuilder::default();
-        let field = schema_builder.add_i64_field(
-            "multifield",
-            IntOptions::default().set_fast(Cardinality::MultiValues),
-        );
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        index_writer.add_document(doc!(field=> 1i64, field => 3i64));
-        index_writer.add_document(doc!());
-        index_writer.add_document(doc!(field=> -4i64));
-        index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
-        assert!(index_writer.commit().is_ok());
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let reader = searcher.segment_reader(0);
-        let mut vals = Vec::new();
-        let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
-        {
-            multi_value_reader.get_vals(2, &mut vals);
-            assert_eq!(&vals, &[-4i64]);
-        }
-        {
-            multi_value_reader.get_vals(0, &mut vals);
-            assert_eq!(&vals, &[1i64, 3i64]);
-        }
-        {
-            multi_value_reader.get_vals(1, &mut vals);
-            assert!(vals.is_empty());
-        }
-        {
-            multi_value_reader.get_vals(3, &mut vals);
-            assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
-        }
-    }
-}
--- a/src/fastfield/multivalued/reader.rs
+++ b/src/fastfield/multivalued/reader.rs
@@ -1,109 +0,0 @@
-use DocId;
-use fastfield::{FastFieldReader, FastValue};
-
-/// Reader for a multivalued `u64` fast field.
-///
-/// The reader is implemented as two `u64` fast field.
-///
-/// The `vals_reader` will access the concatenated list of all
-/// values for all reader.
-/// The `idx_reader` associated, for each document, the index of its first value.
-///
-#[derive(Clone)]
-pub struct MultiValueIntFastFieldReader<Item: FastValue> {
-    idx_reader: FastFieldReader<u64>,
-    vals_reader: FastFieldReader<Item>,
-}
-
-impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
-    pub(crate) fn open(
-        idx_reader: FastFieldReader<u64>,
-        vals_reader: FastFieldReader<Item>,
-    ) -> MultiValueIntFastFieldReader<Item> {
-        MultiValueIntFastFieldReader {
-            idx_reader,
-            vals_reader,
-        }
-    }
-
-    /// Returns the array of values associated to the given `doc`.
-    pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
-        let start = self.idx_reader.get(doc) as u32;
-        let stop = self.idx_reader.get(doc + 1) as u32;
-        let len = (stop - start) as usize;
-        vals.resize(len, Item::default());
-        self.vals_reader.get_range(start, &mut vals[..]);
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use core::Index;
-    use schema::{Document, Facet, SchemaBuilder};
-
-    #[test]
-    fn test_multifastfield_reader() {
-        let mut schema_builder = SchemaBuilder::new();
-        let facet_field = schema_builder.add_facet_field("facets");
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        let mut index_writer = index
-            .writer_with_num_threads(1, 30_000_000)
-            .expect("Failed to create index writer.");
-        {
-            let mut doc = Document::new();
-            doc.add_facet(facet_field, "/category/cat2");
-            doc.add_facet(facet_field, "/category/cat1");
-            index_writer.add_document(doc);
-        }
-        {
-            let mut doc = Document::new();
-            doc.add_facet(facet_field, "/category/cat2");
-            index_writer.add_document(doc);
-        }
-        {
-            let mut doc = Document::new();
-            doc.add_facet(facet_field, "/category/cat3");
-            index_writer.add_document(doc);
-        }
-        index_writer.commit().expect("Commit failed");
-        index.load_searchers().expect("Reloading searchers");
-        let searcher = index.searcher();
-        let segment_reader = searcher.segment_reader(0);
-        let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
-
-        let mut facet = Facet::root();
-        {
-            facet_reader.facet_from_ord(1, &mut facet);
-            assert_eq!(facet, Facet::from("/category"));
-        }
-        {
-            facet_reader.facet_from_ord(2, &mut facet);
-            assert_eq!(facet, Facet::from("/category/cat1"));
-        }
-        {
-            facet_reader.facet_from_ord(3, &mut facet);
-            assert_eq!(format!("{}", facet), "/category/cat2");
-            assert_eq!(facet, Facet::from("/category/cat2"));
-        }
-        {
-            facet_reader.facet_from_ord(4, &mut facet);
-            assert_eq!(facet, Facet::from("/category/cat3"));
-        }
-
-        let mut vals = Vec::new();
-        {
-            facet_reader.facet_ords(0, &mut vals);
-            assert_eq!(&vals[..], &[3, 2]);
-        }
-        {
-            facet_reader.facet_ords(1, &mut vals);
-            assert_eq!(&vals[..], &[3]);
-        }
-        {
-            facet_reader.facet_ords(2, &mut vals);
-            assert_eq!(&vals[..], &[4]);
-        }
-    }
-}
--- a/src/fastfield/multivalued/writer.rs
+++ b/src/fastfield/multivalued/writer.rs
@@ -1,111 +0,0 @@
-use fastfield::FastFieldSerializer;
-use fastfield::serializer::FastSingleFieldSerializer;
-use fastfield::value_to_u64;
-use std::collections::HashMap;
-use postings::UnorderedTermId;
-use schema::{Document, Field};
-use std::io;
-use itertools::Itertools;
-
-pub struct MultiValueIntFastFieldWriter {
-    field: Field,
-    vals: Vec<u64>,
-    doc_index: Vec<u64>,
-    is_facet: bool,
-}
-
-impl MultiValueIntFastFieldWriter {
-    /// Creates a new `IntFastFieldWriter`
-    pub fn new(field: Field, is_facet: bool) -> Self {
-        MultiValueIntFastFieldWriter {
-            field,
-            vals: Vec::new(),
-            doc_index: Vec::new(),
-            is_facet,
-        }
-    }
-
-    pub fn field(&self) -> Field {
-        self.field
-    }
-
-    pub fn next_doc(&mut self) {
-        self.doc_index.push(self.vals.len() as u64);
-    }
-
-    /// Records a new value.
-    ///
-    /// The n-th value being recorded is implicitely
-    /// associated to the document with the `DocId` n.
-    /// (Well, `n-1` actually because of 0-indexing)
-    pub fn add_val(&mut self, val: UnorderedTermId) {
-        self.vals.push(val);
-    }
-
-    pub fn add_document(&mut self, doc: &Document) {
-        if !self.is_facet {
-            for field_value in doc.field_values() {
-                if field_value.field() == self.field {
-                    self.add_val(value_to_u64(field_value.value()));
-                }
-            }
-        }
-    }
-
-    /// Serializes fast field values by pushing them to the `FastFieldSerializer`.
-    ///
-    /// HashMap makes it possible to remap them before serializing.
-    /// Specifically, string terms are first stored in the writer as their
-    /// position in the `IndexWriter`'s `HashMap`. This value is called
-    /// an `UnorderedTermId`.
-    ///
-    /// During the serialization of the segment, terms gets sorted and
-    /// `tantivy` builds a mapping to convert this `UnorderedTermId` into
-    /// term ordinals.
-    ///
-    pub fn serialize(
-        &self,
-        serializer: &mut FastFieldSerializer,
-        mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
-    ) -> io::Result<()> {
-        {
-            // writing the offset index
-            let mut doc_index_serializer =
-                serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
-            for &offset in &self.doc_index {
-                doc_index_serializer.add_val(offset)?;
-            }
-            doc_index_serializer.add_val(self.vals.len() as u64)?;
-            doc_index_serializer.close_field()?;
-        }
-        {
-            // writing the values themselves.
-            let mut value_serializer: FastSingleFieldSerializer<_>;
-            match mapping_opt {
-                Some(mapping) => {
-                    value_serializer = serializer.new_u64_fast_field_with_idx(
-                        self.field,
-                        0u64,
-                        mapping.len() as u64,
-                        1,
-                    )?;
-                    for val in &self.vals {
-                        let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
-                        value_serializer.add_val(remapped_val)?;
-                    }
-                }
-                None => {
-                    let val_min_max = self.vals.iter().cloned().minmax();
-                    let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
-                    value_serializer =
-                        serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
-                    for &val in &self.vals {
-                        value_serializer.add_val(val)?;
-                    }
-                }
-            }
-            value_serializer.close_field()?;
-        }
-        Ok(())
-    }
-}
--- a/src/fastfield/reader.rs
+++ b/src/fastfield/reader.rs
@@ -1,35 +1,107 @@
-use common::BinarySerializable;
-use common::bitpacker::BitUnpacker;
-use common::CompositeFile;
-use common::compute_num_bits;
-use directory::{Directory, RAMDirectory, WritePtr};
 use directory::ReadOnlySource;
+use common::{self, BinarySerializable};
+use common::bitpacker::{compute_num_bits, BitUnpacker};
 use DocId;
-use fastfield::{FastFieldSerializer, FastFieldsWriter};
-use owning_ref::OwningRef;
-use schema::FAST;
 use schema::SchemaBuilder;
-use std::collections::HashMap;
-use std::marker::PhantomData;
-use std::mem;
 use std::path::Path;
-use super::FastValue;
+use schema::FAST;
+use directory::{Directory, RAMDirectory, WritePtr};
+use fastfield::{FastFieldSerializer, FastFieldsWriter};
+use schema::FieldType;
+use std::mem;
+use common::CompositeFile;
+use owning_ref::OwningRef;

 /// Trait for accessing a fastfield.
 ///
 /// Depending on the field type, a different
 /// fast field is required.
-#[derive(Clone)]
-pub struct FastFieldReader<Item: FastValue> {
-    bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
-    min_value_u64: u64,
-    max_value_u64: u64,
-    _phantom: PhantomData<Item>,
+pub trait FastFieldReader: Sized {
+    /// Type of the value stored in the fastfield.
+    type ValueType;
+
+    /// Return the value associated to the given document.
+    ///
+    /// This accessor should return as fast as possible.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `doc` is greater than the segment
+    /// `maxdoc`.
+    fn get(&self, doc: DocId) -> Self::ValueType;
+
+    /// Fills an output buffer with the fast field values
+    /// associated with the `DocId` going from
+    /// `start` to `start + output.len()`.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `start + output.len()` is greater than
+    /// the segment's `maxdoc`.
+    fn get_range(&self, start: u32, output: &mut [Self::ValueType]);
+
+    /// Opens a fast field given a source.
+    fn open(source: ReadOnlySource) -> Self;
+
+    /// Returns true iff the given field_type makes
+    /// it possible to access the field values via a
+    /// fastfield.
+    fn is_enabled(field_type: &FieldType) -> bool;
 }

-impl<Item: FastValue> FastFieldReader<Item> {
-    /// Opens a fast field given a source.
-    pub fn open(data: ReadOnlySource) -> Self {
+/// `FastFieldReader` for unsigned 64-bit integers.
+pub struct U64FastFieldReader {
+    bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
+    min_value: u64,
+    max_value: u64,
+}
+
+impl U64FastFieldReader {
+    /// Returns the minimum value for this fast field.
+    ///
+    /// The min value does not take possibly deleted documents
+    /// into account, and should be considered as a lower bound
+    /// of the actual minimum value.
+    pub fn min_value(&self) -> u64 {
+        self.min_value
+    }
+
+    /// Returns the maximum value for this fast field.
+    ///
+    /// The max value does not take possibly deleted documents
+    /// into account, and should be considered as an upper bound
+    /// of the actual maximum value.
+    pub fn max_value(&self) -> u64 {
+        self.max_value
+    }
+}
+
+impl FastFieldReader for U64FastFieldReader {
+    type ValueType = u64;
+
+    fn get(&self, doc: DocId) -> u64 {
+        self.min_value + self.bit_unpacker.get(doc as usize)
+    }
+
+    fn is_enabled(field_type: &FieldType) -> bool {
+        match *field_type {
+            FieldType::U64(ref integer_options) => integer_options.is_fast(),
+            _ => false,
+        }
+    }
+
+    fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
+        self.bit_unpacker.get_range(start, output);
+        for out in output.iter_mut() {
+            *out += self.min_value;
+        }
+    }
+
+    /// Opens a new fast field reader given a read only source.
+    ///
+    /// # Panics
+    /// Panics if the data is corrupted.
+    fn open(data: ReadOnlySource) -> U64FastFieldReader {
        let min_value: u64;
        let amplitude: u64;
        {
@@ -42,64 +114,17 @@ impl<Item: FastValue> FastFieldReader<Item> {
        let max_value = min_value + amplitude;
        let num_bits = compute_num_bits(amplitude);
        let owning_ref = OwningRef::new(data).map(|data| &data[16..]);
-        let bit_unpacker = BitUnpacker::new(owning_ref, num_bits);
-        FastFieldReader {
-            min_value_u64: min_value,
-            max_value_u64: max_value,
-            bit_unpacker,
-            _phantom: PhantomData,
+        let bit_unpacker = BitUnpacker::new(owning_ref, num_bits as usize);
+        U64FastFieldReader {
+            min_value: min_value,
+            max_value: max_value,
+            bit_unpacker: bit_unpacker,
        }
    }
-
-    /// Return the value associated to the given document.
-    ///
-    /// This accessor should return as fast as possible.
-    ///
-    /// # Panics
-    ///
-    /// May panic if `doc` is greater than the segment
-    // `maxdoc`.
-    pub fn get(&self, doc: DocId) -> Item {
-        Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
-    }
-
-    /// Fills an output buffer with the fast field values
-    /// associated with the `DocId` going from
-    /// `start` to `start + output.len()`.
-    ///
-    /// # Panics
-    ///
-    /// May panic if `start + output.len()` is greater than
-    /// the segment's `maxdoc`.
-    pub fn get_range(&self, start: u32, output: &mut [Item]) {
-        let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
-        self.bit_unpacker.get_range(start, output_u64);
-        for out in output_u64.iter_mut() {
-            *out = Item::from_u64(*out + self.min_value_u64).as_u64();
-        }
-    }
-
-    /// Returns the minimum value for this fast field.
-    ///
-    /// The max value does not take in account of possible
-    /// deleted document, and should be considered as an upper bound
-    /// of the actual maximum value.
-    pub fn min_value(&self) -> Item {
-        Item::from_u64(self.min_value_u64)
-    }
-
-    /// Returns the maximum value for this fast field.
-    ///
-    /// The max value does not take in account of possible
-    /// deleted document, and should be considered as an upper bound
-    /// of the actual maximum value.
-    pub fn max_value(&self) -> Item {
-        Item::from_u64(self.max_value_u64)
-    }
 }

-impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
-    fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
+impl From<Vec<u64>> for U64FastFieldReader {
+    fn from(vals: Vec<u64>) -> U64FastFieldReader {
        let mut schema_builder = SchemaBuilder::default();
        let field = schema_builder.add_u64_field("field", FAST);
        let schema = schema_builder.build();
@@ -117,21 +142,89 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
                    .get_field_writer(field)
                    .expect("With a RAMDirectory, this should never fail.");
                for val in vals {
-                    fast_field_writer.add_val(val.to_u64());
+                    fast_field_writer.add_val(val);
                }
            }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
-                .unwrap();
+            fast_field_writers.serialize(&mut serializer).unwrap();
            serializer.close().unwrap();
        }

        let source = directory.open_read(path).expect("Failed to open the file");
        let composite_file =
            CompositeFile::open(&source).expect("Failed to read the composite file");
+
        let field_source = composite_file
            .open_read(field)
            .expect("File component not found");
-        FastFieldReader::open(field_source)
+        U64FastFieldReader::open(field_source)
+    }
+}
+
+/// `FastFieldReader` for signed 64-bit integers.
+pub struct I64FastFieldReader {
+    underlying: U64FastFieldReader,
+}
+
+impl I64FastFieldReader {
+    /// Returns the minimum value for this fast field.
+    ///
+    /// The min value does not take possibly deleted documents
+    /// into account, and should be considered as a lower bound
+    /// of the actual minimum value.
+    pub fn min_value(&self) -> i64 {
+        common::u64_to_i64(self.underlying.min_value())
+    }
+
+    /// Returns the maximum value for this fast field.
+    ///
+    /// The max value does not take possibly deleted documents
+    /// into account, and should be considered as an upper bound
+    /// of the actual maximum value.
+    pub fn max_value(&self) -> i64 {
+        common::u64_to_i64(self.underlying.max_value())
+    }
+}
+
+impl FastFieldReader for I64FastFieldReader {
+    type ValueType = i64;
+
+    /// Returns the `i64` value associated to the given document.
+    ///
+    /// # Panics
+    ///
+    /// May panic or return an arbitrary wrong result if `doc`
+    /// is greater than or equal to the segment's `maxdoc`.
+    fn get(&self, doc: DocId) -> i64 {
+        common::u64_to_i64(self.underlying.get(doc))
+    }
+
+    ///
+    /// # Panics
+    ///
+    /// May panic or return arbitrary wrong results if
+    /// `start + output.len()` is greater than the segment's `maxdoc`.
+    fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
+        let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
+        self.underlying.get_range(start, output_u64);
+        for mut_val in output_u64.iter_mut() {
+            *mut_val = common::u64_to_i64(*mut_val as u64) as u64;
+        }
+    }
+
+    /// Opens a new fast field reader given a read only source.
+    ///
+    /// # Panics
+    /// Panics if the data is corrupted.
+    fn open(data: ReadOnlySource) -> I64FastFieldReader {
+        I64FastFieldReader {
+            underlying: U64FastFieldReader::open(data),
+        }
+    }
+
+    fn is_enabled(field_type: &FieldType) -> bool {
+        match *field_type {
+            FieldType::I64(ref integer_options) => integer_options.is_fast(),
+            _ => false,
+        }
    }
 }
--- a/src/fastfield/serializer.rs
+++ b/src/fastfield/serializer.rs
@@ -1,8 +1,7 @@
 use common::BinarySerializable;
 use directory::WritePtr;
 use schema::Field;
-use common::bitpacker::BitPacker;
-use common::compute_num_bits;
+use common::bitpacker::{compute_num_bits, BitPacker};
 use common::CountingWriter;
 use common::CompositeWrite;
 use std::io::{self, Write};
@@ -46,18 +45,7 @@ impl FastFieldSerializer {
        min_value: u64,
        max_value: u64,
    ) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
-        self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
-    }
-
-    /// Start serializing a new u64 fast field
-    pub fn new_u64_fast_field_with_idx(
-        &mut self,
-        field: Field,
-        min_value: u64,
-        max_value: u64,
-        idx: usize,
-    ) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
-        let field_write = self.composite_write.for_field_with_idx(field, idx);
+        let field_write = self.composite_write.for_field(field);
        FastSingleFieldSerializer::open(field_write, min_value, max_value)
    }

@@ -73,7 +61,6 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
    bit_packer: BitPacker,
    write: &'a mut W,
    min_value: u64,
-    num_bits: u8,
 }

 impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
@@ -86,20 +73,18 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
        let amplitude = max_value - min_value;
        amplitude.serialize(write)?;
        let num_bits = compute_num_bits(amplitude);
-        let bit_packer = BitPacker::new();
+        let bit_packer = BitPacker::new(num_bits as usize);
        Ok(FastSingleFieldSerializer {
            write,
            bit_packer,
            min_value,
-            num_bits,
        })
    }

    /// Pushes a new value to the currently open u64 fast field.
    pub fn add_val(&mut self, val: u64) -> io::Result<()> {
        let val_to_write: u64 = val - self.min_value;
-        self.bit_packer
-            .write(val_to_write, self.num_bits, &mut self.write)?;
+        self.bit_packer.write(val_to_write, &mut self.write)?;
        Ok(())
    }

--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -1,111 +1,93 @@
-use schema::{Cardinality, Document, Field, Schema};
+use schema::{Document, Field, Schema};
 use fastfield::FastFieldSerializer;
 use std::io;
+use schema::Value;
+use DocId;
 use schema::FieldType;
 use common;
 use common::VInt;
-use std::collections::HashMap;
-use postings::UnorderedTermId;
-use super::multivalued::MultiValueIntFastFieldWriter;
 use common::BinarySerializable;

 /// The fastfieldswriter regroup all of the fast field writers.
 pub struct FastFieldsWriter {
-    single_value_writers: Vec<IntFastFieldWriter>,
-    multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
+    field_writers: Vec<IntFastFieldWriter>,
 }

 impl FastFieldsWriter {
    /// Create all `FastFieldWriter` required by the schema.
    pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
-        let mut single_value_writers = Vec::new();
-        let mut multi_values_writers = Vec::new();
-
-        for (field_id, field_entry) in schema.fields().iter().enumerate() {
-            let field = Field(field_id as u32);
-            let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
-                common::i64_to_u64(0i64)
-            } else {
-                0u64
-            };
-            match *field_entry.field_type() {
-                FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
-                    match int_options.get_fastfield_cardinality() {
-                        Some(Cardinality::SingleValue) => {
+        let field_writers: Vec<IntFastFieldWriter> = schema
+            .fields()
+            .iter()
+            .enumerate()
+            .flat_map(|(field_id, field_entry)| {
+                let field = Field(field_id as u32);
+                match *field_entry.field_type() {
+                    FieldType::I64(ref int_options) => {
+                        if int_options.is_fast() {
                            let mut fast_field_writer = IntFastFieldWriter::new(field);
-                            fast_field_writer.set_val_if_missing(default_value);
-                            single_value_writers.push(fast_field_writer);
+                            fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64));
+                            Some(fast_field_writer)
+                        } else {
+                            None
                        }
-                        Some(Cardinality::MultiValues) => {
-                            let fast_field_writer = MultiValueIntFastFieldWriter::new(field, false);
-                            multi_values_writers.push(fast_field_writer);
-                        }
-                        None => {}
                    }
+                    FieldType::U64(ref int_options) => {
+                        if int_options.is_fast() {
+                            Some(IntFastFieldWriter::new(field))
+                        } else {
+                            None
+                        }
+                    }
+                    _ => None,
                }
-                FieldType::HierarchicalFacet => {
-                    let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
-                    multi_values_writers.push(fast_field_writer);
-                }
-                _ => {}
-            }
-        }
+            })
+            .collect();
+        FastFieldsWriter { field_writers }
+    }
+
+    /// Returns a `FastFieldsWriter`
+    /// with an `IntFastFieldWriter` for each
+    /// of the fields given as argument.
+    pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
        FastFieldsWriter {
-            single_value_writers,
-            multi_values_writers,
+            field_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
        }
    }

    /// Get the `FastFieldWriter` associated to a field.
    pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
        // TODO optimize
-        self.single_value_writers
+        self.field_writers
            .iter_mut()
-            .find(|field_writer| field_writer.field() == field)
-    }
-
-    /// Returns the fast field multi-value writer for the given field.
-    ///
-    /// Returns None if the field does not exist, or is not
-    /// configured as a multivalued fastfield in the schema.
-    pub(crate) fn get_multivalue_writer(
-        &mut self,
-        field: Field,
-    ) -> Option<&mut MultiValueIntFastFieldWriter> {
-        // TODO optimize
-        // TODO expose for users
-        self.multi_values_writers
-            .iter_mut()
-            .find(|multivalue_writer| multivalue_writer.field() == field)
+            .find(|field_writer| field_writer.field == field)
    }

    /// Indexes all of the fastfields of a new document.
    pub fn add_document(&mut self, doc: &Document) {
-        for field_writer in &mut self.single_value_writers {
-            field_writer.add_document(doc);
-        }
-        for field_writer in &mut self.multi_values_writers {
-            field_writer.next_doc();
+        for field_writer in &mut self.field_writers {
            field_writer.add_document(doc);
        }
    }

    /// Serializes all of the `FastFieldWriter`s by pushing them in
    /// order to the fast field serializer.
-    pub fn serialize(
-        &self,
-        serializer: &mut FastFieldSerializer,
-        mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
-    ) -> io::Result<()> {
-        for field_writer in &self.single_value_writers {
+    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
+        for field_writer in &self.field_writers {
            field_writer.serialize(serializer)?;
        }
-        for field_writer in &self.multi_values_writers {
-            let field = field_writer.field();
-            field_writer.serialize(serializer, mapping.get(&field))?;
-        }
        Ok(())
    }
+
+    /// Ensures all of the fast field writers have
+    /// reached `doc` (inclusive).
+    ///
+    /// Missing values are filled with each writer's `val_if_missing` value.
+    pub fn fill_val_up_to(&mut self, doc: DocId) {
+        for field_writer in &mut self.field_writers {
+            field_writer.fill_val_up_to(doc);
+        }
+    }
 }

 /// Fast field writer for ints.
@@ -145,11 +127,6 @@ impl IntFastFieldWriter {
        }
    }

-    /// Returns the field that this writer is targetting.
-    pub fn field(&self) -> Field {
-        self.field
-    }
-
    /// Sets the default value.
    ///
    /// This default value is recorded for documents if
@@ -158,6 +135,19 @@ impl IntFastFieldWriter {
        self.val_if_missing = val_if_missing;
    }

+    /// Ensures this fast field writer has
+    /// reached `doc` (inclusive).
+    ///
+    /// Missing values are filled with `val_if_missing`.
+    fn fill_val_up_to(&mut self, doc: DocId) {
+        let target = doc as usize + 1;
+        debug_assert!(self.val_count <= target);
+        let val_if_missing = self.val_if_missing;
+        while self.val_count < target {
+            self.add_val(val_if_missing);
+        }
+    }
+
    /// Records a new value.
    ///
    /// The n-th value being recorded is implicitely
@@ -190,7 +180,11 @@ impl IntFastFieldWriter {
    /// only the first one is taken in account.
    fn extract_val(&self, doc: &Document) -> u64 {
        match doc.get_first(self.field) {
-            Some(v) => super::value_to_u64(v),
+            Some(v) => match *v {
+                Value::U64(ref val) => *val,
+                Value::I64(ref val) => common::i64_to_u64(*val),
+                _ => panic!("Expected a u64field, got {:?} ", v),
+            },
            None => self.val_if_missing,
        }
    }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Paul Masurel	1658be3792	Various changes. Need to cherrypick some of them and put them into master	2017-12-25 10:35:10 +09:00
Paul Masurel	23fad88b35	NOBUG common crawl, streamdict works with 64 bits (hopefully)	2017-12-21 22:44:50 +09:00