tracing

add support for exists query syntax in query parser (#2170 )
* add support for exists query syntax in query parser * rustfmt * make Exists require a field
2025-12-27 20:42:54 +00:00 · 2023-10-16 19:23:47 +09:00 · 2023-09-19 11:10:39 +02:00 · 2023-09-19 08:21:50 +02:00 · 2023-09-14 09:22:18 +02:00 · 2023-09-13 07:38:34 +02:00
367 changed files with 32062 additions and 22711 deletions
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -2,21 +2,26 @@ name: Coverage

 on:
  push:
-    branches: [ main ]
+    branches: [main]
  pull_request:
-    branches: [ main ]
+    branches: [main]
+
+# Ensures that we cancel running jobs for the same PR / same workflow.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true

 jobs:
  coverage:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Install Rust
        run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
      - uses: Swatinem/rust-cache@v2
      - uses: taiki-e/install-action@cargo-llvm-cov
      - name: Generate code coverage
-        run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
+        run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        continue-on-error: true
--- a/.github/workflows/long_running.yml
+++ b/.github/workflows/long_running.yml
@@ -8,13 +8,18 @@ env:
  CARGO_TERM_COLOR: always
  NUM_FUNCTIONAL_TEST_ITERATIONS: 20000

+# Ensures that we cancel running jobs for the same PR / same workflow.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
  test:

    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Install stable
      uses: actions-rs/toolchain@v1
      with:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,13 +9,18 @@ on:
 env:
  CARGO_TERM_COLOR: always

+# Ensures that we cancel running jobs for the same PR / same workflow.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
  check:

    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4

    - name: Install nightly
      uses: actions-rs/toolchain@v1
@@ -48,14 +53,14 @@ jobs:
    strategy:
      matrix:
        features: [
-            { label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
+            { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
            { label: "quickwit", flags: "mmap,quickwit,failpoints" }
        ]

    name: test-${{ matrix.features.label}}

    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4

    - name: Install stable
      uses: actions-rs/toolchain@v1
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ benchmark
 .idea
 trace.dat
 cargo-timing*
+control
+variable
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -254,7 +254,7 @@ The token positions of all of the terms are then stored in a separate file with
 The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate through the docset,
 we advance the position reader by the number of term frequencies of the current document.

-## [fieldnorms/](src/fieldnorms): Here is my doc, how many tokens in this field?
+## [fieldnorm/](src/fieldnorm): Here is my doc, how many tokens in this field?

 The [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) formula also requires to know the number of tokens stored in a specific field for a given document. We store this information on one byte per document in the fieldnorm.
 The fieldnorm is therefore compressed. Values up to 40 are encoded unchanged.
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,119 @@
+Tantivy 0.21
+================================
+#### Bugfixes
+- Fix track fast field memory consumption, which led to higher memory consumption than the budget allowed during indexing [#2148](https://github.com/quickwit-oss/tantivy/issues/2148)[#2147](https://github.com/quickwit-oss/tantivy/issues/2147)(@PSeitz)
+- Fix a regression from 0.20 where sort index by date wasn't working anymore [#2124](https://github.com/quickwit-oss/tantivy/issues/2124)(@PSeitz)
+- Fix getting the root facet on the `FacetCollector`. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086)(@adamreichold)
+- Align numerical type priority order of columnar and query. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088)(@fmassot)
+#### Breaking Changes
+- Remove support for Brotli and Snappy compression [#2123](https://github.com/quickwit-oss/tantivy/issues/2123)(@adamreichold)
+#### Features/Improvements
+- Implement lenient query parser [#2129](https://github.com/quickwit-oss/tantivy/pull/2129)(@trinity-1686a)
+- order_by_u64_field and order_by_fast_field allow sorting in ascending and descending order [#2111](https://github.com/quickwit-oss/tantivy/issues/2111)(@naveenann)
+- Allow dynamic filters in text analyzer builder [#2110](https://github.com/quickwit-oss/tantivy/issues/2110)(@fulmicoton @fmassot)
+- **Aggregation**
+  - Add missing parameter for term aggregation [#2149](https://github.com/quickwit-oss/tantivy/issues/2149)[#2103](https://github.com/quickwit-oss/tantivy/issues/2103)(@PSeitz)
+  - Add missing parameter for percentiles [#2157](https://github.com/quickwit-oss/tantivy/issues/2157)(@PSeitz)
+  - Add missing parameter for stats,min,max,count,sum,avg [#2151](https://github.com/quickwit-oss/tantivy/issues/2151)(@PSeitz)
+  - Improve aggregation deserialization error message [#2150](https://github.com/quickwit-oss/tantivy/issues/2150)(@PSeitz)
+  - Add validation for type Bytes to term_agg [#2077](https://github.com/quickwit-oss/tantivy/issues/2077)(@PSeitz)
+  - Alternative mixed field collection [#2135](https://github.com/quickwit-oss/tantivy/issues/2135)(@PSeitz)
+- Add missing query_terms impl for TermSetQuery. [#2120](https://github.com/quickwit-oss/tantivy/issues/2120)(@adamreichold)
+- Minor improvements to OwnedBytes [#2134](https://github.com/quickwit-oss/tantivy/issues/2134)(@adamreichold)
+- Remove allocations in split compound words [#2080](https://github.com/quickwit-oss/tantivy/issues/2080)(@PSeitz)
+- Ngram tokenizer now returns an error with invalid arguments [#2102](https://github.com/quickwit-oss/tantivy/issues/2102)(@fmassot)
+- Make TextAnalyzerBuilder public [#2097](https://github.com/quickwit-oss/tantivy/issues/2097)(@adamreichold)
+- Return an error when tokenizer is not found while indexing [#2093](https://github.com/quickwit-oss/tantivy/issues/2093)(@naveenann)
+- Delayed column opening during merge [#2132](https://github.com/quickwit-oss/tantivy/issues/2132)(@PSeitz)
+
+Tantivy 0.20.2
+================================
+- Align numerical type priority order on the search side.  [#2088](https://github.com/quickwit-oss/tantivy/issues/2088) (@fmassot)
+- Fix is_child_of function not considering the root facet. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086) (@adamreichold)
+
+Tantivy 0.20.1
+================================
+- Fix building on windows with mmap [#2070](https://github.com/quickwit-oss/tantivy/issues/2070) (@ChillFish8)
+
+Tantivy 0.20
+================================
+#### Bugfixes
+- Fix phrase queries with slop (slop now supports transpositions, algorithm that carries slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031)[#2020](https://github.com/quickwit-oss/tantivy/issues/2020)(@PSeitz)
+- Handle error for exists on MMapDirectory [#1988](https://github.com/quickwit-oss/tantivy/issues/1988) (@PSeitz)
+- Aggregation
+  - Fix min doc_count empty merge bug [#2057](https://github.com/quickwit-oss/tantivy/issues/2057) (@PSeitz)
+  - Fix: Sort order for term aggregations (sort order on key was inverted) [#1858](https://github.com/quickwit-oss/tantivy/issues/1858) (@PSeitz)
+
+#### Features/Improvements
+- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
+- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
+- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
+- Move tokenizer API to separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
+- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
+  - Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
+  - Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
+  - Unified access for fast fields over different cardinalities.
+  - Unified storage for typed and untyped fields.
+  - Move fastfield codecs into columnar. [#1782](https://github.com/quickwit-oss/tantivy/issues/1782) (@fulmicoton)
+  - Sparse dense index for optional values [#1716](https://github.com/quickwit-oss/tantivy/issues/1716) (@PSeitz)
+  - Switch to nanosecond precision in DateTime fastfield [#2016](https://github.com/quickwit-oss/tantivy/issues/2016) (@PSeitz)
+- **Aggregation**
+  - Add `date_histogram` aggregation (only `fixed_interval` for now) [#1900](https://github.com/quickwit-oss/tantivy/issues/1900) (@PSeitz)
+  - Add `percentiles` aggregations [#1984](https://github.com/quickwit-oss/tantivy/issues/1984) (@PSeitz)
+  - [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
+  - Set memory limit in bytes for aggregations after which they abort (Previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942)[#1957](https://github.com/quickwit-oss/tantivy/issues/1957)(@PSeitz)
+  - Add support for u64,i64,f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
+  - Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
+  - Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
+  - Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
+  - Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
+  - Reduce term aggregation memory consumption [#2013](https://github.com/quickwit-oss/tantivy/issues/2013) (@PSeitz)
+  - Reduce agg memory consumption: Replace generic aggregation collector (which has a high memory requirement per instance) in aggregation tree with optimized versions behind a trait.
+  - Split term collection count and sub_agg (Faster term agg with less memory consumption for cases without sub-aggs) [#1921](https://github.com/quickwit-oss/tantivy/issues/1921) (@PSeitz)
+  - Schemaless aggregations: In combination with stacker, tantivy now supports schemaless aggregations via the JSON type.
+    - Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
+    - Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
+  - Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
+  - Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
+- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
+- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
+- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
+- Faster indexing
+  - Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
+  - Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
+  - tokenizer-api: reduce Tokenizer allocation overhead [#2062](https://github.com/quickwit-oss/tantivy/issues/2062) (@PSeitz)
+  - Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
+- Faster search
+  - Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
+  - Faster fast field range queries using SIMD [#1954](https://github.com/quickwit-oss/tantivy/issues/1954) (@fulmicoton)
+  - Improve fast field range query performance [#1864](https://github.com/quickwit-oss/tantivy/issues/1864) (@PSeitz)
+- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
+- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
+- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
+- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
+- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
+- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
+- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
+- sstable
+  - Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
+  - New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
+  - Use DeltaReader directly to implement Dictionary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
+  - Use DeltaReader directly to implement Dictionary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
+- Add separate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
+- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
+- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
+- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
+- Query Parser
+  - Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
+  - PhrasePrefixQuery is supported in the query parser via: `field:"phrase ter"*` [#2044](https://github.com/quickwit-oss/tantivy/issues/2044) (@adamreichold)
+- Docs
+  - Update examples for literate docs [#1880](https://github.com/quickwit-oss/tantivy/issues/1880) (@PSeitz)
+  - Add ip field example [#1775](https://github.com/quickwit-oss/tantivy/issues/1775) (@PSeitz)
+  - Fix doc store cache documentation [#1821](https://github.com/quickwit-oss/tantivy/issues/1821) (@PSeitz)
+  - Fix BooleanQuery document [#1999](https://github.com/quickwit-oss/tantivy/issues/1999) (@RT_Enzyme)
+  - Update comments in the faceted search example [#1737](https://github.com/quickwit-oss/tantivy/issues/1737) (@DawChihLiou)
+
+
 Tantivy 0.19
 ================================
 #### Bugfixes
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.19.0"
+version = "0.21.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -12,27 +12,27 @@ readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2021"
 rust-version = "1.62"
+exclude = ["benches/*.json", "benches/*.txt"]

 [dependencies]
 oneshot = "0.1.5"
 base64 = "0.21.0"
 byteorder = "1.4.3"
 crc32fast = "1.3.2"
+tracing = "0.1"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
-aho-corasick = "0.7"
+aho-corasick = "1.0"
 tantivy-fst = "0.4.0"
-memmap2 = { version = "0.5.3", optional = true }
-lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
-brotli = { version = "3.3.4", optional = true }
+memmap2 = { version = "0.7.1", optional = true }
+lz4_flex = { version = "0.11", default-features = false, optional = true }
 zstd = { version = "0.12", optional = true, default-features = false }
-snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.3.0", optional = true }
 log = "0.4.16"
 serde = { version = "1.0.136", features = ["derive"] }
 serde_json = "1.0.79"
 num_cpus = "1.13.1"
-fs2 = { version = "0.4.3", optional = true }
+fs4 = { version = "0.6.3", optional = true }
 levenshtein_automata = "0.2.1"
 uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
@@ -43,26 +43,27 @@ census = "0.4.0"
 rustc-hash = "1.1.0"
 thiserror = "1.0.30"
 htmlescape = "0.3.1"
-fail = "0.5.0"
-murmurhash32 = "0.2.0"
+fail = { version = "0.5.0", optional = true }
+murmurhash32 = "0.3.0"
 time = { version = "0.3.10", features = ["serde-well-known"] }
 smallvec = "1.8.0"
 rayon = "1.5.2"
-lru = "0.9.0"
+lru = "0.11.0"
 fastdivide = "0.4.0"
-itertools = "0.10.3"
+itertools = "0.11.0"
 measure_time = "0.8.2"
 async-trait = "0.1.53"
 arc-swap = "1.5.0"

-columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
-sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
-stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
-tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
-tantivy-bitpacker = 		{ version= "0.3", path="./bitpacker" }
-common = 								{ version= "0.5", path = "./common/", package = "tantivy-common" }
-fastfield_codecs = 			{ version= "0.3", path="./fastfield_codecs", default-features = false }
-tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
+columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
+sstable = { version= "0.2", path="./sstable", package ="tantivy-sstable", optional = true }
+stacker = { version= "0.2", path="./stacker", package ="tantivy-stacker" }
+query-grammar = { version= "0.21.0", path="./query-grammar", package = "tantivy-query-grammar" }
+tantivy-bitpacker = { version= "0.5", path="./bitpacker" }
+common = { version= "0.6", path = "./common/", package = "tantivy-common" }
+tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
+sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
+futures-util = { version = "0.3.28", optional = true }

 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
@@ -73,11 +74,16 @@ maplit = "1.0.2"
 matches = "0.1.9"
 pretty_assertions = "1.2.1"
 proptest = "1.0.0"
-criterion = "0.4"
 test-log = "0.2.10"
 env_logger = "0.10.0"
-pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
 futures = "0.3.21"
+paste = "1.0.11"
+more-asserts = "0.3.1"
+rand_distr = "0.4.3"
+
+[target.'cfg(not(windows))'.dev-dependencies]
+criterion = "0.5"
+pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5

 [dev-dependencies.fail]
 version = "0.5.0"
@@ -88,27 +94,30 @@ opt-level = 3
 debug = false
 debug-assertions = false

+[profile.bench]
+opt-level = 3
+debug = true
+debug-assertions = false
+
 [profile.test]
 debug-assertions = true
 overflow-checks = true

 [features]
 default = ["mmap", "stopwords", "lz4-compression"]
-mmap = ["fs2", "tempfile", "memmap2"]
+mmap = ["fs4", "tempfile", "memmap2"]
 stopwords = []

-brotli-compression = ["brotli"]
 lz4-compression = ["lz4_flex"]
-snappy-compression = ["snap"]
 zstd-compression = ["zstd"]

-failpoints = ["fail/failpoints"]
+failpoints = ["fail", "fail/failpoints"]
 unstable = [] # useful for benches.

-quickwit = ["sstable"]
+quickwit = ["sstable", "futures-util"]

 [workspace]
-members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
+members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]

 # Following the "fail" crate best practises, we isolate
 # tests that define specific behavior in fail check points
@@ -120,7 +129,7 @@ members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbyt
 [[test]]
 name = "failpoints"
 path = "tests/failpoints/mod.rs"
-required-features = ["fail/failpoints"]
+required-features = ["failpoints"]

 [[bench]]
 name = "analyzer"
@@ -129,4 +138,3 @@ harness = false
 [[bench]]
 name = "index-bench"
 harness = false
-
--- a/2
+++ b/2
@@ -1,5 +1,5 @@
 test:
-	echo "Run test only... No examples."
+	@echo "Run test only... No examples."
 	cargo test --tests --lib

 fmt:
--- a/README.md
+++ b/README.md
@@ -26,6 +26,8 @@ Your mileage WILL vary depending on the nature of queries and their load.

 <img src="doc/assets/images/searchbenchmark.png">

+Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game).
+
 # Features

 - Full-text search
@@ -42,7 +44,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
 - Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
 - Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
+- Compressed document store (LZ4, Zstd, None)
 - Range queries
 - Faceted search
 - Configurable indexing (optional term frequency and position indexing)
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -0,0 +1,21 @@
+# Release a new Tantivy Version
+
+## Steps
+
+1. Identify new packages in workspace since last release
+2. Identify changed packages in workspace since last release
+3. Bump version in `Cargo.toml` and their dependents for all changed packages
+4. Update version of root `Cargo.toml`
+5. Publish version starting with leaf nodes
+6. Set git tag with new version
+
+
+In conjunction with `cargo-release` Steps 1-4 (I'm not sure if the change detection works):
+Set new packages to version 0.0.0
+
+Replace prev-tag-name
+```bash
+cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
+```
+
+no-tag or it will create tags for all the subpackages
--- a/TODO.txt
+++ b/TODO.txt
@@ -0,0 +1,18 @@
+Make schema_builder API fluent.
+fix doc serialization and prevent compression problems
+
+u64, etc. should return Result<Option>: now that we support optional, missing a column is really not an error
+remove fastfield codecs
+ditch the first_or_default trick. if it is still useful, improve its implementation.
+rename FastFieldReaders::open to load
+
+
+remove fast field reader
+
+find a way to unify the two DateTime.
+readd type check in the filter wrapper
+
+add unit test on columnar list columns.
+
+make sure sort works
+
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,23 +0,0 @@
-# Appveyor configuration template for Rust using rustup for Rust installation
-# https://github.com/starkat99/appveyor-rust
-
-os: Visual Studio 2015
-environment:
-  matrix:
-    - channel: stable
-      target: x86_64-pc-windows-msvc
-
-install:
-  - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
-  - rustup-init -yv --default-toolchain %channel% --default-host %target%
-  - set PATH=%PATH%;%USERPROFILE%\.cargo\bin
-  - if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
-  - rustc -vV
-  - cargo -vV
-
-build: false
-
-test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
-  - REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
-  - REM SET RUST_BACKTRACE=1 & cargo build --examples
--- a/benches/analyzer.rs
+++ b/benches/analyzer.rs
@@ -1,11 +1,13 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use tantivy::tokenizer::TokenizerManager;
+use tantivy::tokenizer::{
+    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
+};

 const ALICE_TXT: &str = include_str!("alice.txt");

 pub fn criterion_benchmark(c: &mut Criterion) {
    let tokenizer_manager = TokenizerManager::default();
-    let tokenizer = tokenizer_manager.get("default").unwrap();
+    let mut tokenizer = tokenizer_manager.get("default").unwrap();
    c.bench_function("default-tokenize-alice", |b| {
        b.iter(|| {
            let mut word_count = 0;
@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
            assert_eq!(word_count, 30_731);
        })
    });
+    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
+        .dynamic()
+        .filter_dynamic(RemoveLongFilter::limit(40))
+        .filter_dynamic(LowerCaser)
+        .build();
+    c.bench_function("dynamic-tokenize-alice", |b| {
+        b.iter(|| {
+            let mut word_count = 0;
+            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
+            while token_stream.advance() {
+                word_count += 1;
+            }
+            assert_eq!(word_count, 30_731);
+        })
+    });
 }

-criterion_group!(benches, criterion_benchmark);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().sample_size(200);
+    targets = criterion_benchmark
+}
 criterion_main!(benches);
--- a/benches/gh.json
+++ b/benches/gh.json
--- a/benches/index-bench.rs
+++ b/benches/index-bench.rs
@@ -1,10 +1,15 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 use pprof::criterion::{Output, PProfProfiler};
-use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
+use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
 use tantivy::Index;

 const HDFS_LOGS: &str = include_str!("hdfs.json");
-const NUM_REPEATS: usize = 2;
+const GH_LOGS: &str = include_str!("gh.json");
+const WIKI: &str = include_str!("wiki.json");
+
+fn get_lines(input: &str) -> Vec<&str> {
+    input.trim().split('\n').collect()
+}

 pub fn hdfs_index_benchmark(c: &mut Criterion) {
    let schema = {
@@ -28,85 +33,147 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
    };

    let mut group = c.benchmark_group("index-hdfs");
+    group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
    group.sample_size(20);
    group.bench_function("index-hdfs-no-commit", |b| {
+        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(schema.clone());
            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
            }
        })
    });
    group.bench_function("index-hdfs-with-commit", |b| {
+        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(schema.clone());
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
            }
        })
    });
    group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let doc = schema.parse_document(doc_json).unwrap();
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let doc = schema.parse_document(doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
+        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(dynamic_schema.clone());
            let json_field = dynamic_schema.get_field("json").unwrap();
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
-    group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
+}
+
+pub fn gh_index_benchmark(c: &mut Criterion) {
+    let dynamic_schema = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_json_field("json", TEXT | FAST);
+        schema_builder.build()
+    };
+
+    let mut group = c.benchmark_group("index-gh");
+    group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
+
+    group.bench_function("index-gh-no-commit", |b| {
+        let lines = get_lines(GH_LOGS);
        b.iter(|| {
-            let index = Index::create_in_ram(dynamic_schema.clone());
            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-gh-with-commit", |b| {
+        let lines = get_lines(GH_LOGS);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
-            for _ in 0..NUM_REPEATS {
-                for doc_json in HDFS_LOGS.trim().split('\n') {
-                    let json_val: serde_json::Map<String, serde_json::Value> =
-                        serde_json::from_str(doc_json).unwrap();
-                    let doc = tantivy::doc!(json_field=>json_val);
-                    index_writer.add_document(doc).unwrap();
-                }
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+            index_writer.commit().unwrap();
+        })
+    });
+}
+
+pub fn wiki_index_benchmark(c: &mut Criterion) {
+    let dynamic_schema = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_json_field("json", TEXT | FAST);
+        schema_builder.build()
+    };
+
+    let mut group = c.benchmark_group("index-wiki");
+    group.throughput(Throughput::Bytes(WIKI.len() as u64));
+
+    group.bench_function("index-wiki-no-commit", |b| {
+        let lines = get_lines(WIKI);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-wiki-with-commit", |b| {
+        let lines = get_lines(WIKI);
+        b.iter(|| {
+            let json_field = dynamic_schema.get_field("json").unwrap();
+            let index = Index::create_in_ram(dynamic_schema.clone());
+            let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
@@ -115,7 +182,17 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {

 criterion_group! {
    name = benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    config = Criterion::default();
    targets = hdfs_index_benchmark
 }
-criterion_main!(benches);
+criterion_group! {
+    name = gh_benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = gh_index_benchmark
+}
+criterion_group! {
+    name = wiki_benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = wiki_index_benchmark
+}
+criterion_main!(benches, gh_benches, wiki_benches);
--- a/benches/wiki.json
+++ b/benches/wiki.json
--- a/bitpacker/Cargo.toml
+++ b/bitpacker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-bitpacker"
-version = "0.3.0"
+version = "0.5.0"
 edition = "2021"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
@@ -15,6 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
+bitpacking = {version="0.8", default-features=false, features = ["bitpacker1x"]}

 [dev-dependencies]
 rand = "0.8"
--- a/bitpacker/src/bitpacker.rs
+++ b/bitpacker/src/bitpacker.rs
@@ -1,10 +1,14 @@
 use std::convert::TryInto;
 use std::io;
+use std::ops::{Range, RangeInclusive};
+
+use bitpacking::{BitPacker as ExternalBitPackerTrait, BitPacker1x};

 pub struct BitPacker {
    mini_buffer: u64,
    mini_buffer_written: usize,
 }
+
 impl Default for BitPacker {
    fn default() -> Self {
        BitPacker::new()
@@ -19,7 +23,7 @@ impl BitPacker {
    }

    #[inline]
-    pub fn write<TWrite: io::Write>(
+    pub fn write<TWrite: io::Write + ?Sized>(
        &mut self,
        val: u64,
        num_bits: u8,
@@ -43,7 +47,7 @@ impl BitPacker {
        Ok(())
    }

-    pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+    pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
        if self.mini_buffer_written > 0 {
            let num_bytes = (self.mini_buffer_written + 7) / 8;
            let bytes = self.mini_buffer.to_le_bytes();
@@ -54,7 +58,7 @@ impl BitPacker {
        Ok(())
    }

-    pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+    pub fn close<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
        self.flush(output)?;
        Ok(())
    }
@@ -118,6 +122,125 @@ impl BitUnpacker {
        let val_shifted = val_unshifted_unmasked >> bit_shift;
        val_shifted & self.mask
    }
+
+    /// Decodes the range of bitpacked `u32` values with idx
+    /// in [start_idx, start_idx + output.len()).
+    ///
+    /// # Panics
+    ///
+    /// This method panics if the bit width is > 32, or if `data` is too short to hold
+    /// every requested value.
+    fn get_batch_u32s(&self, start_idx: u32, data: &[u8], output: &mut [u32]) {
+        assert!(
+            self.bit_width() <= 32,
+            "Bitwidth must be <= 32 to use this method."
+        );
+
+        let end_idx = start_idx + output.len() as u32;
+
+        // Bit offsets are computed in `u64`: `idx * num_bits` can exceed `u32::MAX`, and a
+        // wrapped value would let the bounds check below pass spuriously.
+        let end_bit_read = u64::from(end_idx) * u64::from(self.num_bits);
+        let end_byte_read = (end_bit_read + 7) / 8;
+        assert!(
+            end_byte_read <= data.len() as u64,
+            "Requested index is out of bounds."
+        );
+
+        // Simple slow implementation of get_batch_u32s, to deal with our ramps.
+        let get_batch_ramp = |start_idx: u32, output: &mut [u32]| {
+            for (out, idx) in output.iter_mut().zip(start_idx..) {
+                *out = self.get(idx, data) as u32;
+            }
+        };
+
+        // We use an unrolled routine to decode 32 values at once.
+        // We therefore decompose our range of values to decode into three ranges:
+        // - Entrance ramp: [start_idx, highway_start) (up to 7 values)
+        // - Highway: [highway_start, highway_end) (a length multiple of 32s)
+        // - Exit ramp: [highway_end, start_idx + output.len()) (up to 31 values)
+
+        // We want the start of the highway to be aligned with bytes.
+        // A sufficient condition is to start with an idx that is a multiple of 8,
+        // so highway start is the closest multiple of 8 that is >= start_idx.
+        //
+        // The outer `% 8` matters: without the parentheses an already aligned `start_idx`
+        // would get a needless 8-value ramp instead of an empty one.
+        let entrance_ramp_len = (8 - start_idx % 8) % 8;
+
+        let highway_start: u32 = start_idx + entrance_ramp_len;
+
+        if highway_start + BitPacker1x::BLOCK_LEN as u32 > end_idx {
+            // We don't have enough values to have even a single block of highway.
+            // Let's just supply the values the simple way.
+            get_batch_ramp(start_idx, output);
+            return;
+        }
+
+        let num_blocks: u32 = (end_idx - highway_start) / BitPacker1x::BLOCK_LEN as u32;
+
+        // Entrance ramp
+        get_batch_ramp(start_idx, &mut output[..entrance_ramp_len as usize]);
+
+        // Highway
+        // `highway_start` is a multiple of 8, so the division is exact. The result fits in
+        // `usize` because it is bounded by `end_byte_read <= data.len()`.
+        let mut offset = (u64::from(highway_start) * u64::from(self.num_bits) / 8) as usize;
+        let mut output_cursor = (highway_start - start_idx) as usize;
+        for _ in 0..num_blocks {
+            // `decompress` returns the number of bytes it consumed.
+            offset += BitPacker1x.decompress(
+                &data[offset..],
+                &mut output[output_cursor..],
+                self.num_bits as u8,
+            );
+            output_cursor += BitPacker1x::BLOCK_LEN;
+        }
+
+        // Exit ramp
+        let highway_end = highway_start + num_blocks * BitPacker1x::BLOCK_LEN as u32;
+        get_batch_ramp(highway_end, &mut output[output_cursor..]);
+    }
+
+    /// Collects into `positions` the ids of `id_range` whose value lies within `range`.
+    ///
+    /// Whatever `positions` held before the call is discarded.
+    /// Bit widths above 32 go through the value-by-value implementation; narrower widths
+    /// use the batched `u32` decoder followed by a vector filter.
+    pub fn get_ids_for_value_range(
+        &self,
+        range: RangeInclusive<u64>,
+        id_range: Range<u32>,
+        data: &[u8],
+        positions: &mut Vec<u32>,
+    ) {
+        if self.bit_width() > 32 {
+            return self.get_ids_for_value_range_slow(range, id_range, data, positions);
+        }
+        let max_u32 = u32::MAX as u64;
+        let (lower, upper) = (*range.start(), *range.end());
+        if lower > max_u32 {
+            // No value of at most 32 bits can reach the lower bound.
+            positions.clear();
+            return;
+        }
+        // Clamp the upper bound so the range can be expressed over `u32`.
+        let range_u32 = (lower as u32)..=(upper.min(max_u32) as u32);
+        self.get_ids_for_value_range_fast(range_u32, id_range, data, positions)
+    }
+
+    /// Value-by-value implementation of `get_ids_for_value_range`, valid for any bit width.
+    ///
+    /// Clears `positions`, then appends, in increasing order, every id of `id_range`
+    /// whose decoded value is contained in `range`.
+    fn get_ids_for_value_range_slow(
+        &self,
+        range: RangeInclusive<u64>,
+        id_range: Range<u32>,
+        data: &[u8],
+        positions: &mut Vec<u32>,
+    ) {
+        positions.clear();
+        // If we cared we could make this branchless, but the slow implementation should rarely
+        // kick in.
+        let matching_ids = id_range.filter(|&id| range.contains(&self.get(id, data)));
+        positions.extend(matching_ids);
+    }
+
+    /// Batched implementation of `get_ids_for_value_range` for bit widths of at most 32.
+    ///
+    /// `positions` is first used as a scratch buffer receiving the decoded values of
+    /// `id_range`, and is then filtered in place so that it ends up holding the matching ids.
+    fn get_ids_for_value_range_fast(
+        &self,
+        value_range: RangeInclusive<u32>,
+        id_range: Range<u32>,
+        data: &[u8],
+        positions: &mut Vec<u32>,
+    ) {
+        let first_id = id_range.start;
+        let num_ids = id_range.len();
+        // One slot per id; the decode below writes the values for the whole range.
+        positions.resize(num_ids, 0u32);
+        self.get_batch_u32s(first_id, data, positions);
+        crate::filter_vec::filter_vec_in_place(value_range, first_id, positions)
+    }
 }

 #[cfg(test)]
@@ -200,4 +323,58 @@ mod test {
            test_bitpacker_aux(num_bits, &vals);
        }
    }
+
+    // Batch decoding only supports bit widths up to 32: 33 bits must panic.
+    #[test]
+    #[should_panic]
+    fn test_get_batch_panics_over_32_bits() {
+        let unpacker = BitUnpacker::new(33);
+        let data = [0u8; 8];
+        let mut decoded = [0u32; 1];
+        unpacker.get_batch_u32s(0, &data, &mut decoded);
+    }
+
+    // 4 bytes hold exactly 32 one-bit values: reading the last three of them must succeed.
+    #[test]
+    fn test_get_batch_limit() {
+        let unpacker = BitUnpacker::new(1);
+        let data = [0u8; 4];
+        let mut decoded = [0u32; 3];
+        unpacker.get_batch_u32s(8 * 4 - 3, &data, &mut decoded);
+    }
+
+    // Same setup as the limit test, shifted by one value: the last read falls out of `data`.
+    #[test]
+    #[should_panic]
+    fn test_get_batch_panics_when_off_scope() {
+        let unpacker = BitUnpacker::new(1);
+        let data = [0u8; 4];
+        let mut decoded = [0u32; 3];
+        // We are missing exactly one bit.
+        unpacker.get_batch_u32s(8 * 4 - 2, &data, &mut decoded);
+    }
+
+    proptest::proptest! {
+        // Packs the values 0..100 (masked to `num_bits`) and checks that batch decoding
+        // returns them for several lengths and every start alignment.
+        #[test]
+        fn test_get_batch_u32s_proptest(num_bits in 0u8..=32u8) {
+            // Mask keeping the `num_bits` lowest bits; shifting by 32 would overflow.
+            let mask = match num_bits {
+                32u8 => u32::MAX,
+                _ => (1u32 << num_bits) - 1,
+            };
+            let mut buffer: Vec<u8> = Vec::new();
+            let mut bitpacker = BitPacker::new();
+            for val in 0u64..100u64 {
+                bitpacker.write(val & u64::from(mask), num_bits, &mut buffer).unwrap();
+            }
+            bitpacker.flush(&mut buffer).unwrap();
+            let bitunpacker = BitUnpacker::new(num_bits);
+            let mut output: Vec<u32> = Vec::new();
+            for len in [0usize, 1, 2, 32, 33, 34, 64] {
+                for start_idx in 0u32..32u32 {
+                    output.resize(len, 0);
+                    bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
+                    for (decoded, idx) in output.iter().zip(start_idx..) {
+                        let expected = idx & mask;
+                        assert_eq!(*decoded, expected);
+                    }
+                }
+            }
+        }
+    }
 }
--- a/bitpacker/src/blocked_bitpacker.rs
+++ b/bitpacker/src/blocked_bitpacker.rs
@@ -64,10 +64,8 @@ fn mem_usage<T>(items: &Vec<T>) -> usize {

 impl BlockedBitpacker {
    pub fn new() -> Self {
-        let mut compressed_blocks = vec![];
-        compressed_blocks.resize(8, 0);
        Self {
-            compressed_blocks,
+            compressed_blocks: vec![0; 8],
            buffer: vec![],
            offset_and_bits: vec![],
        }
--- a/bitpacker/src/filter_vec/avx2.rs
+++ b/bitpacker/src/filter_vec/avx2.rs
@@ -0,0 +1,365 @@
+//! SIMD filtering of a vector as described in the following blog post.
+//! <https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512>
+use std::arch::x86_64::{
+    __m256i as DataType, _mm256_add_epi32 as op_add, _mm256_cmpgt_epi32 as op_greater,
+    _mm256_lddqu_si256 as load_unaligned, _mm256_or_si256 as op_or, _mm256_set1_epi32 as set1,
+    _mm256_storeu_si256 as store_unaligned, _mm256_xor_si256 as op_xor, *,
+};
+use std::ops::RangeInclusive;
+
+const NUM_LANES: usize = 8;
+
+const HIGHEST_BIT: u32 = 1 << 31;
+
+/// Maps a `u32` to an `i32` by flipping its highest bit.
+///
+/// The mapping preserves ordering, which lets unsigned values be compared with signed
+/// comparison instructions.
+#[inline]
+fn u32_to_i32(val: u32) -> i32 {
+    let flipped = val ^ HIGHEST_BIT;
+    flipped as i32
+}
+
+/// Lane-wise counterpart of `u32_to_i32`: flips the highest bit of each of the 8 lanes.
+#[inline]
+unsafe fn u32_to_i32_avx2(vals_u32x8s: DataType) -> DataType {
+    const FLIP_HIGHEST_BITS: DataType = from_u32x8([HIGHEST_BIT; NUM_LANES]);
+    let flipped = op_xor(vals_u32x8s, FLIP_HIGHEST_BITS);
+    flipped
+}
+
+/// Replaces the values held in `output` by the ids of the values that fall within `range`.
+///
+/// The value at position `i` has id `offset + i`. On return `output` holds the matching
+/// ids in increasing order and is truncated to their count.
+///
+/// NOTE(review): this calls an AVX2-only routine; callers are presumably expected to have
+/// checked that AVX2 is available first — confirm against the dispatch code.
+pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
+    // We use a monotonic mapping from u32 to i32 to make the comparison possible in AVX2.
+    let (lower, upper) = (*range.start(), *range.end());
+    let range_i32: RangeInclusive<i32> = u32_to_i32(lower)..=u32_to_i32(upper);
+    // Full 8-lane words are handled by the SIMD routine.
+    let num_words = output.len() / NUM_LANES;
+    let mut num_kept = unsafe {
+        filter_vec_avx2_aux(
+            output.as_ptr() as *const __m256i,
+            range_i32,
+            output.as_mut_ptr(),
+            offset,
+            num_words,
+        )
+    };
+    // The remaining values (fewer than 8) are filtered one by one.
+    // The write position never gets ahead of the read position, so each value is read
+    // before its slot can be overwritten.
+    for idx in (num_words * NUM_LANES)..output.len() {
+        let val = output[idx];
+        output[num_kept] = offset + idx as u32;
+        if range.contains(&val) {
+            num_kept += 1;
+        }
+    }
+    output.truncate(num_kept);
+}
+
+/// Filters `num_words` 8-lane words read from `input`, writing the ids of the lanes whose
+/// value falls within `range` contiguously from `output`.
+///
+/// Ids are numbered from `offset`: the first lane of the first word has id `offset`.
+/// Values are mapped with `u32_to_i32_avx2` before being compared, so `range` is
+/// expressed in that mapped `i32` space.
+///
+/// Returns the number of ids written.
+///
+/// # Safety
+///
+/// NOTE(review): requirements inferred from the body, please confirm.
+/// - The CPU must support AVX2.
+/// - `input` must be readable for `num_words` words of 8 `u32`s.
+/// - `output` must be writable for a full 8-lane store at every position the tail
+///   pointer reaches.
+#[target_feature(enable = "avx2")]
+unsafe fn filter_vec_avx2_aux(
+    mut input: *const __m256i,
+    range: RangeInclusive<i32>,
+    output: *mut u32,
+    offset: u32,
+    num_words: usize,
+) -> usize {
+    let mut output_tail = output;
+    // Range bounds broadcast to all 8 lanes.
+    let range_simd = set1(*range.start())..=set1(*range.end());
+    // Ids of the 8 lanes of the current word.
+    let mut ids = from_u32x8([
+        offset,
+        offset + 1,
+        offset + 2,
+        offset + 3,
+        offset + 4,
+        offset + 5,
+        offset + 6,
+        offset + 7,
+    ]);
+    // Added to `ids` after each word to move on to the ids of the next word.
+    const SHIFT: __m256i = from_u32x8([NUM_LANES as u32; NUM_LANES]);
+    for _ in 0..num_words {
+        let word = load_unaligned(input);
+        let word = u32_to_i32_avx2(word);
+        // One bit per lane, set when the lane's value is within the range.
+        let keeper_bitset = compute_filter_bitset(word, range_simd.clone());
+        let added_len = keeper_bitset.count_ones();
+        // Moves the ids of the kept lanes to the front of the vector.
+        let filtered_doc_ids = compact(ids, keeper_bitset);
+        // A full 8-lane vector is stored, but the tail only advances by `added_len`:
+        // the extra lanes get overwritten by the next store.
+        store_unaligned(output_tail as *mut __m256i, filtered_doc_ids);
+        output_tail = output_tail.offset(added_len as isize);
+        ids = op_add(ids, SHIFT);
+        input = input.offset(1);
+    }
+    // Number of `u32` ids written since the start of `output`.
+    output_tail.offset_from(output) as usize
+}
+
+/// Permutes the lanes of `data` using the `MASK_TO_PERMUTATION` entry for `mask`.
+#[inline]
+#[target_feature(enable = "avx2")]
+unsafe fn compact(data: DataType, mask: u8) -> DataType {
+    let permutation = MASK_TO_PERMUTATION[usize::from(mask)];
+    _mm256_permutevar8x32_epi32(data, permutation)
+}
+
+/// Returns an 8-bit set in which bit `i` is set when lane `i` of `val` lies within
+/// `range` (bounds included), using signed 32-bit comparisons.
+#[inline]
+#[target_feature(enable = "avx2")]
+unsafe fn compute_filter_bitset(val: __m256i, range: std::ops::RangeInclusive<__m256i>) -> u8 {
+    let below_start = op_greater(*range.start(), val);
+    let above_end = op_greater(val, *range.end());
+    // Lanes that are out of range, on either side.
+    let outside = op_or(below_start, above_end);
+    let outside_bitset =
+        _mm256_movemask_ps(std::mem::transmute::<DataType, __m256>(outside)) as u8;
+    // Complementing the out-of-range lanes yields the lanes to keep.
+    !outside_bitset
+}
+
+/// Reinterprets 8 `u32` lanes as a SIMD vector; usable in `const` contexts.
+union U32x8 {
+    vector: DataType,
+    vals: [u32; NUM_LANES],
+}
+
+/// Builds a SIMD vector whose lanes are `vals`, in order.
+const fn from_u32x8(vals: [u32; NUM_LANES]) -> DataType {
+    let lanes = U32x8 { vals };
+    unsafe { lanes.vector }
+}
+
+/// For each 8-bit mask, the lane permutation listing the indices of the set bits of the
+/// mask, lowest first. Unused trailing slots are left at 0.
+///
+/// Example: mask `0b0000_0110` maps to `[1, 2, 0, 0, 0, 0, 0, 0]`.
+const MASK_TO_PERMUTATION: [DataType; 256] = {
+    let mut table = [from_u32x8([0; NUM_LANES]); 256];
+    let mut mask = 0usize;
+    while mask < 256 {
+        let mut permutation = [0u32; NUM_LANES];
+        let mut num_kept = 0usize;
+        let mut lane = 0usize;
+        while lane < NUM_LANES {
+            if mask & (1 << lane) != 0 {
+                permutation[num_kept] = lane as u32;
+                num_kept += 1;
+            }
+            lane += 1;
+        }
+        table[mask] = from_u32x8(permutation);
+        mask += 1;
+    }
+    table
+};
--- a/bitpacker/src/filter_vec/mod.rs
+++ b/bitpacker/src/filter_vec/mod.rs
@@ -0,0 +1,165 @@
+use std::ops::RangeInclusive;
+
+#[cfg(target_arch = "x86_64")]
+mod avx2;
+
+mod scalar;
+
+#[derive(Clone, Copy, Eq, PartialEq, Debug)]
+#[repr(u8)]
+enum FilterImplPerInstructionSet {
+    #[cfg(target_arch = "x86_64")]
+    AVX2 = 0u8,
+    Scalar = 1u8,
+}
+
+impl FilterImplPerInstructionSet {
+    #[inline]
+    pub fn is_available(&self) -> bool {
+        match *self {
+            #[cfg(target_arch = "x86_64")]
+            FilterImplPerInstructionSet::AVX2 => is_x86_feature_detected!("avx2"),
+            FilterImplPerInstructionSet::Scalar => true,
+        }
+    }
+}
+
+// List of available implementations, in order of preference.
+#[cfg(target_arch = "x86_64")]
+const IMPLS: [FilterImplPerInstructionSet; 2] = [
+    FilterImplPerInstructionSet::AVX2,
+    FilterImplPerInstructionSet::Scalar,
+];
+
+#[cfg(not(target_arch = "x86_64"))]
+const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
+
+impl FilterImplPerInstructionSet {
+    #[allow(unused_variables)]
+    #[inline]
+    fn from(code: u8) -> FilterImplPerInstructionSet {
+        #[cfg(target_arch = "x86_64")]
+        if code == FilterImplPerInstructionSet::AVX2 as u8 {
+            return FilterImplPerInstructionSet::AVX2;
+        }
+        FilterImplPerInstructionSet::Scalar
+    }
+
+    #[inline]
+    fn filter_vec_in_place(self, range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
+        match self {
+            #[cfg(target_arch = "x86_64")]
+            FilterImplPerInstructionSet::AVX2 => avx2::filter_vec_in_place(range, offset, output),
+            FilterImplPerInstructionSet::Scalar => {
+                scalar::filter_vec_in_place(range, offset, output)
+            }
+        }
+    }
+}
+
+#[inline]
+fn get_best_available_instruction_set() -> FilterImplPerInstructionSet {
+    use std::sync::atomic::{AtomicU8, Ordering};
+    static INSTRUCTION_SET_BYTE: AtomicU8 = AtomicU8::new(u8::MAX);
+    let instruction_set_byte: u8 = INSTRUCTION_SET_BYTE.load(Ordering::Relaxed);
+    if instruction_set_byte == u8::MAX {
+        // Let's initialize the instruction set and cache it.
+        let instruction_set = IMPLS
+            .into_iter()
+            .find(FilterImplPerInstructionSet::is_available)
+            .unwrap();
+        INSTRUCTION_SET_BYTE.store(instruction_set as u8, Ordering::Relaxed);
+        return instruction_set;
+    }
+    FilterImplPerInstructionSet::from(instruction_set_byte)
+}
+
+pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
+    get_best_available_instruction_set().filter_vec_in_place(range, offset, output)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_get_best_available_instruction_set() {
+        // This does not test much unfortunately.
+        // We just make sure the function returns without crashing and returns the same result.
+        let instruction_set = get_best_available_instruction_set();
+        assert_eq!(get_best_available_instruction_set(), instruction_set);
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    #[test]
+    fn test_instruction_set_to_code_from_code() {
+        for instruction_set in [
+            FilterImplPerInstructionSet::AVX2,
+            FilterImplPerInstructionSet::Scalar,
+        ] {
+            let code = instruction_set as u8;
+            assert_eq!(instruction_set, FilterImplPerInstructionSet::from(code));
+        }
+    }
+
+    fn test_filter_impl_empty_aux(filter_impl: FilterImplPerInstructionSet) {
+        let mut output = vec![];
+        filter_impl.filter_vec_in_place(0..=u32::MAX, 0, &mut output);
+        assert_eq!(&output, &[]);
+    }
+
+    fn test_filter_impl_simple_aux(filter_impl: FilterImplPerInstructionSet) {
+        let mut output = vec![3, 2, 1, 5, 11, 2, 5, 10, 2];
+        filter_impl.filter_vec_in_place(3..=10, 0, &mut output);
+        assert_eq!(&output, &[0, 3, 6, 7]);
+    }
+
+    fn test_filter_impl_simple_aux_shifted(filter_impl: FilterImplPerInstructionSet) {
+        let mut output = vec![3, 2, 1, 5, 11, 2, 5, 10, 2];
+        filter_impl.filter_vec_in_place(3..=10, 10, &mut output);
+        assert_eq!(&output, &[10, 13, 16, 17]);
+    }
+
+    fn test_filter_impl_simple_outside_i32_range(filter_impl: FilterImplPerInstructionSet) {
+        let mut output = vec![u32::MAX, i32::MAX as u32 + 1, 0, 1, 3, 1, 1, 1, 1];
+        filter_impl.filter_vec_in_place(1..=i32::MAX as u32 + 1u32, 0, &mut output);
+        assert_eq!(&output, &[1, 3, 4, 5, 6, 7, 8]);
+    }
+
+    fn test_filter_impl_test_suite(filter_impl: FilterImplPerInstructionSet) {
+        test_filter_impl_empty_aux(filter_impl);
+        test_filter_impl_simple_aux(filter_impl);
+        test_filter_impl_simple_aux_shifted(filter_impl);
+        test_filter_impl_simple_outside_i32_range(filter_impl);
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_filter_implementation_avx2() {
+        if FilterImplPerInstructionSet::AVX2.is_available() {
+            test_filter_impl_test_suite(FilterImplPerInstructionSet::AVX2);
+        }
+    }
+
+    #[test]
+    fn test_filter_implementation_scalar() {
+        test_filter_impl_test_suite(FilterImplPerInstructionSet::Scalar);
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    proptest::proptest! {
+        #[test]
+        fn test_filter_compare_scalar_and_avx2_impl_proptest(
+            start in proptest::prelude::any::<u32>(),
+            end in proptest::prelude::any::<u32>(),
+            offset in 0u32..2u32,
+            mut vals in proptest::collection::vec(0..u32::MAX, 0..30)) {
+            if FilterImplPerInstructionSet::AVX2.is_available() {
+                let mut vals_clone = vals.clone();
+                FilterImplPerInstructionSet::AVX2.filter_vec_in_place(start..=end, offset, &mut vals);
+                FilterImplPerInstructionSet::Scalar.filter_vec_in_place(start..=end, offset, &mut vals_clone);
+                assert_eq!(&vals, &vals_clone);
+            }
+       }
+    }
+}
--- a/bitpacker/src/filter_vec/scalar.rs
+++ b/bitpacker/src/filter_vec/scalar.rs
@@ -0,0 +1,13 @@
+use std::ops::RangeInclusive;
+
+pub fn filter_vec_in_place(range: RangeInclusive<u32>, offset: u32, output: &mut Vec<u32>) {
+    // In-place compaction: the id `offset + i` is written unconditionally and the cursor only
+    // advances when `val` is in `range`, so ids of rejected values get overwritten.
+    let mut output_cursor = 0;
+    for i in 0..output.len() {
+        let val = output[i];
+        output[output_cursor] = offset + i as u32;
+        output_cursor += if range.contains(&val) { 1 } else { 0 };
+    }
+    output.truncate(output_cursor);
+}
--- a/bitpacker/src/lib.rs
+++ b/bitpacker/src/lib.rs
@@ -1,5 +1,6 @@
 mod bitpacker;
 mod blocked_bitpacker;
+mod filter_vec;

 use std::cmp::Ordering;

--- a/ci/before_deploy.ps1
+++ b/ci/before_deploy.ps1
@@ -1,23 +0,0 @@
-# This script takes care of packaging the build artifacts that will go in the
-# release zipfile
-
-$SRC_DIR = $PWD.Path
-$STAGE = [System.Guid]::NewGuid().ToString()
-
-Set-Location $ENV:Temp
-New-Item -Type Directory -Name $STAGE
-Set-Location $STAGE
-
-$ZIP = "$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip"
-
-# TODO Update this to package the right artifacts
-Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\'
-
-7z a "$ZIP" *
-
-Push-AppveyorArtifact "$ZIP"
-
-Remove-Item *.* -Force
-Set-Location ..
-Remove-Item $STAGE
-Set-Location $SRC_DIR
--- a/ci/before_deploy.sh
+++ b/ci/before_deploy.sh
@@ -1,33 +0,0 @@
-# This script takes care of building your crate and packaging it for release
-
-set -ex
-
-main() {
-    local src=$(pwd) \
-          stage=
-
-    case $TRAVIS_OS_NAME in
-        linux)
-            stage=$(mktemp -d)
-            ;;
-        osx)
-            stage=$(mktemp -d -t tmp)
-            ;;
-    esac
-
-    test -f Cargo.lock || cargo generate-lockfile
-
-    # TODO Update this to build the artifacts that matter to you
-    cross rustc --bin hello --target $TARGET --release -- -C lto
-
-    # TODO Update this to package the right artifacts
-    cp target/$TARGET/release/hello $stage/
-
-    cd $stage
-    tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
-    cd $src
-
-    rm -rf $stage
-}
-
-main
--- a/ci/install.sh
+++ b/ci/install.sh
@@ -1,47 +0,0 @@
-set -ex
-
-main() {
-    local target=
-    if [ $TRAVIS_OS_NAME = linux ]; then
-        target=x86_64-unknown-linux-musl
-        sort=sort
-    else
-        target=x86_64-apple-darwin
-        sort=gsort  # for `sort --sort-version`, from brew's coreutils.
-    fi
-
-    # Builds for iOS are done on OSX, but require the specific target to be
-    # installed.
-    case $TARGET in
-        aarch64-apple-ios)
-            rustup target install aarch64-apple-ios
-            ;;
-        armv7-apple-ios)
-            rustup target install armv7-apple-ios
-            ;;
-        armv7s-apple-ios)
-            rustup target install armv7s-apple-ios
-            ;;
-        i386-apple-ios)
-            rustup target install i386-apple-ios
-            ;;
-        x86_64-apple-ios)
-            rustup target install x86_64-apple-ios
-            ;;
-    esac
-
-    # This fetches latest stable release
-    local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \
-                       | cut -d/ -f3 \
-                       | grep -E '^v[0.1.0-9.]+$' \
-                       | $sort --version-sort \
-                       | tail -n1)
-    curl -LSfs https://japaric.github.io/trust/install.sh | \
-        sh -s -- \
-           --force \
-           --git japaric/cross \
-           --tag $tag \
-           --target $target
-}
-
-main
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-# This script takes care of testing your crate
-
-set -ex
-
-main() {
-    if [ ! -z $CODECOV ]; then
-        echo "Codecov"
-        cargo build --verbose && cargo coverage --verbose --all && bash <(curl -s https://codecov.io/bash) -s target/kcov
-    else
-        echo "Build"
-        cross build --target $TARGET
-        if [ ! -z $DISABLE_TESTS ]; then
-            return
-        fi
-        echo "Test"
-        cross test --target $TARGET --no-default-features --features mmap
-        cross test --target $TARGET --no-default-features --features mmap query-grammar
-    fi
-    for example in $(ls examples/*.rs)
-    do
-        cargo run --example  $(basename $example .rs)
-    done
-}
-
-# we don't run the "test phase" when doing deploys
-if [ -z $TRAVIS_TAG ]; then
-    main
-fi
--- a/cliff.toml
+++ b/cliff.toml
@@ -0,0 +1,90 @@
+# configuration file for git-cliff
+# see https://github.com/orhun/git-cliff#configuration-file
+
+[changelog]
+# changelog header
+header = """
+"""
+# template for the changelog body
+# https://tera.netlify.app/docs/#introduction
+body = """
+{% if version %}\
+    {{ version | trim_start_matches(pat="v") }} ({{ timestamp | date(format="%Y-%m-%d") }})
+    ==================
+{% else %}\
+    ## [unreleased]
+{% endif %}\
+{% for commit in commits %}
+    - {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | split(pat="\n") | first | trim | upper_first }}(@{{ commit.author.name }})\
+{% endfor %}
+"""
+# remove the leading and trailing whitespace from the template
+trim = true
+# changelog footer
+footer = """
+"""
+
+postprocessors = [
+    { pattern = 'Paul Masurel', replace = "fulmicoton"}, # replace with github user
+    { pattern = 'PSeitz', replace = "PSeitz"}, # replace with github user
+    { pattern = 'Adam Reichold', replace = "adamreichold"}, # replace with github user
+    { pattern = 'trinity-1686a', replace = "trinity-1686a"}, # replace with github user
+    { pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
+    { pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
+    { pattern = 'François Massot', replace = "fmassot"}, # replace with github user
+    { pattern = 'Naveen Aiathurai', replace = "naveenann"}, # replace with github user
+    { pattern = '', replace = ""}, # replace with github user
+]
+
+[git]
+# parse the commits based on https://www.conventionalcommits.org
+# This is required or commit.message contains the whole commit message and not just the title
+conventional_commits = true
+# filter out the commits that are not conventional
+filter_unconventional = false
+# process each line of a commit as an individual commit
+split_commits = false
+# regex for preprocessing the commit messages
+commit_preprocessors = [
+    { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "[#${2}](https://github.com/quickwit-oss/tantivy/issues/${2})"}, # replace issue numbers
+]
+#link_parsers = [
+    #{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"},
+#]
+# regex for parsing and grouping commits
+commit_parsers = [
+    { message = "^feat", group = "Features"},
+    { message = "^fix", group = "Bug Fixes"},
+    { message = "^doc", group = "Documentation"},
+    { message = "^perf", group = "Performance"},
+    { message = "^refactor", group = "Refactor"},
+    { message = "^style", group = "Styling"},
+    { message = "^test", group = "Testing"},
+    { message = "^chore\\(release\\): prepare for", skip = true},
+    { message = "(?i)clippy", skip = true},
+    { message = "(?i)dependabot", skip = true},
+    { message = "(?i)fmt", skip = true},
+    { message = "(?i)bump", skip = true},
+    { message = "(?i)readme", skip = true},
+    { message = "(?i)comment", skip = true},
+    { message = "(?i)spelling", skip = true},
+    { message = "^chore", group = "Miscellaneous Tasks"},
+    { body = ".*security", group = "Security"},
+    { message = ".*", group = "Other", default_scope = "other"},
+]
+# protect breaking changes from being skipped due to matching a skipping commit_parser
+protect_breaking_commits = false
+# filter out the commits that are not matched by commit parsers
+filter_commits = false
+# glob pattern for matching git tags
+tag_pattern = "v[0-9]*"
+# regex for skipping tags
+skip_tags = "v0.1.0-beta.1"
+# regex for ignoring tags
+ignore_tags = ""
+# sort the tags topologically
+topo_order = false
+# sort the commits inside sections by oldest/newest order
+sort_commits = "newest"
+# limit the number of commits included in the changelog.
+# limit_commits = 42
--- a/columnar/Cargo.toml
+++ b/columnar/Cargo.toml
@@ -1,27 +1,28 @@
 [package]
 name = "tantivy-columnar"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 license = "MIT"
+homepage = "https://github.com/quickwit-oss/tantivy"
+repository = "https://github.com/quickwit-oss/tantivy"
+description = "column oriented storage for tantivy"
+categories = ["database-implementations", "data-structures", "compression"]

 [dependencies]
-itertools = "0.10.5"
-log = "0.4.17"
+itertools = "0.11.0"
 fnv = "1.0.7"
 fastdivide = "0.4.0"
-rand = { version = "0.8.5", optional = true }
-measure_time = { version = "0.8.2", optional = true }
-prettytable-rs = { version = "0.10.0", optional = true }

-stacker = { path = "../stacker", package="tantivy-stacker"}
-sstable = { path = "../sstable", package = "tantivy-sstable" }
-common = { path = "../common", package = "tantivy-common" }
-tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
+stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}
+sstable = { version= "0.2", path = "../sstable", package = "tantivy-sstable" }
+common = { version= "0.6", path = "../common", package = "tantivy-common" }
+tantivy-bitpacker = { version= "0.5", path = "../bitpacker/" }
+serde = "1.0.152"

 [dev-dependencies]
-proptest = "1.0.0"
+proptest = "1"
 more-asserts = "0.3.1"
-rand = "0.8.5"
+rand = "0.8"

 [features]
 unstable = []
--- a/columnar/benches/bench_u128.rs
+++ b/columnar/benches/bench_u128.rs
@@ -0,0 +1,124 @@
+#![feature(test)]
+
+use std::ops::RangeInclusive;
+use std::sync::Arc;
+
+use common::OwnedBytes;
+use rand::rngs::StdRng;
+use rand::seq::SliceRandom;
+use rand::{random, Rng, SeedableRng};
+use tantivy_columnar::ColumnValues;
+use test::Bencher;
+extern crate test;
+
+// TODO does this make sense for IPv6 ?
+fn generate_random() -> Vec<u64> {
+    let mut permutation: Vec<u64> = (0u64..100_000u64)
+        .map(|el| el + random::<u16>() as u64)
+        .collect();
+    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+    permutation
+}
+
+fn get_u128_column_random() -> Arc<dyn ColumnValues<u128>> {
+    let permutation = generate_random();
+    let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
+    get_u128_column_from_data(&permutation)
+}
+
+fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn ColumnValues<u128>> {
+    let mut out = vec![];
+    tantivy_columnar::column_values::serialize_column_values_u128(&data, &mut out).unwrap();
+    let out = OwnedBytes::new(out);
+    tantivy_columnar::column_values::open_u128_mapped::<u128>(out).unwrap()
+}
+
+const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50;
+const SINGLE_ITEM: u64 = 90;
+const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
+
+fn get_data_50percent_item() -> Vec<u128> {
+    let mut rng = StdRng::from_seed([1u8; 32]);
+
+    let mut data = vec![];
+    for _ in 0..300_000 {
+        let val = rng.gen_range(1..=100);
+        data.push(val);
+    }
+    data.push(SINGLE_ITEM);
+    data.shuffle(&mut rng);
+    let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
+    data
+}
+
+#[bench]
+fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
+    let data = get_data_50percent_item();
+    let column = get_u128_column_from_data(&data);
+
+    b.iter(|| {
+        let mut positions = Vec::new();
+        column.get_row_ids_for_value_range(
+            *FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
+            0..data.len() as u32,
+            &mut positions,
+        );
+        positions
+    });
+}
+
+#[bench]
+fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
+    let data = get_data_50percent_item();
+    let column = get_u128_column_from_data(&data);
+
+    b.iter(|| {
+        let mut positions = Vec::new();
+        column.get_row_ids_for_value_range(
+            *SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128,
+            0..data.len() as u32,
+            &mut positions,
+        );
+        positions
+    });
+}
+
+#[bench]
+fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
+    let data = get_data_50percent_item();
+    let column = get_u128_column_from_data(&data);
+
+    b.iter(|| {
+        let mut positions = Vec::new();
+        column.get_row_ids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions);
+        positions
+    });
+}
+// U128 RANGE END
+
+#[bench]
+fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {
+    let column = get_u128_column_random();
+
+    b.iter(|| {
+        let mut a = 0u128;
+        for i in 0u64..column.num_vals() as u64 {
+            a += column.get_val(i as u32);
+        }
+        a
+    });
+}
+
+#[bench]
+fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) {
+    let column = get_u128_column_random();
+
+    b.iter(|| {
+        let n = column.num_vals();
+        let mut a = 0u128;
+        for i in (0..n / 5).map(|val| val * 5) {
+            a += column.get_val(i);
+        }
+        a
+    });
+}
--- a/columnar/benches/bench_u64.rs
+++ b/columnar/benches/bench_u64.rs
@@ -0,0 +1,211 @@
+#![feature(test)]
+extern crate test;
+
+use std::ops::RangeInclusive;
+use std::sync::Arc;
+
+use rand::prelude::*;
+use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
+use tantivy_columnar::*;
+use test::Bencher;
+
+// Warning: this generates the same permutation at each call
+fn generate_permutation() -> Vec<u64> {
+    let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
+    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+    permutation
+}
+
+fn generate_random() -> Vec<u64> {
+    let mut permutation: Vec<u64> = (0u64..100_000u64)
+        .map(|el| el + random::<u16>() as u64)
+        .collect();
+    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+    permutation
+}
+
+// Warning: this generates the same permutation at each call
+fn generate_permutation_gcd() -> Vec<u64> {
+    let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect();
+    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+    permutation
+}
+
+pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn ColumnValues<u64>> {
+    serialize_and_load_u64_based_column_values(&column, &[codec_type])
+}
+
+#[bench]
+fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
+    let permutation = generate_permutation();
+    let n = permutation.len();
+    b.iter(|| {
+        let mut a = 0u64;
+        for _ in 0..n {
+            a = permutation[a as usize];
+        }
+        a
+    });
+}
+
+#[bench]
+fn bench_intfastfield_jumpy_fflookup_bitpacked(b: &mut Bencher) {
+    let permutation = generate_permutation();
+    let n = permutation.len();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
+    b.iter(|| {
+        let mut a = 0u64;
+        for _ in 0..n {
+            a = column.get_val(a as u32);
+        }
+        a
+    });
+}
+
+const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50;
+const SINGLE_ITEM: u64 = 90;
+const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
+const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49;
+fn get_data_50percent_item() -> Vec<u128> {
+    let mut rng = StdRng::from_seed([1u8; 32]);
+
+    let mut data = vec![];
+    for _ in 0..300_000 {
+        let val = rng.gen_range(1..=100);
+        data.push(val);
+    }
+    data.push(SINGLE_ITEM);
+
+    data.shuffle(&mut rng);
+    let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
+    data
+}
+
+// U64 RANGE START
+#[bench]
+fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
+    let data = get_data_50percent_item();
+    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
+    b.iter(|| {
+        let mut positions = Vec::new();
+        column.get_row_ids_for_value_range(
+            FIFTY_PERCENT_RANGE,
+            0..data.len() as u32,
+            &mut positions,
+        );
+        positions
+    });
+}
+
+#[bench]
+fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) {
+    let data = get_data_50percent_item();
+    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
+
+    b.iter(|| {
+        let mut positions = Vec::new();
+        column.get_row_ids_for_value_range(
+            ONE_PERCENT_ITEM_RANGE,
+            0..data.len() as u32,
+            &mut positions,
+        );
+        positions
+    });
+}
+
+#[bench]
+fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
+    let data = get_data_50percent_item();
+    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
+
+    b.iter(|| {
+        let mut positions = Vec::new();
+        column.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..data.len() as u32, &mut positions);
+        positions
+    });
+}
+
+#[bench]
+fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
+    let data = get_data_50percent_item();
+    let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&data, CodecType::Bitpacked);
+
+    b.iter(|| {
+        let mut positions = Vec::new();
+        column.get_row_ids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
+        positions
+    });
+}
+// U64 RANGE END
+
+#[bench]
+fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
+    let permutation = generate_permutation();
+    let n = permutation.len();
+    b.iter(|| {
+        let mut a = 0u64;
+        for i in (0..n / 7).map(|val| val * 7) {
+            a += permutation[i as usize];
+        }
+        a
+    });
+}
+
+#[bench]
+fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
+    let permutation = generate_permutation();
+    let n = permutation.len();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
+    b.iter(|| {
+        let mut a = 0;
+        for i in (0..n / 7).map(|val| val * 7) {
+            a += column.get_val(i as u32);
+        }
+        a
+    });
+}
+
+#[bench]
+fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
+    let permutation = generate_permutation();
+    let n = permutation.len();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
+    let column_ref = column.as_ref();
+    b.iter(|| {
+        let mut a = 0u64;
+        for i in 0u32..n as u32 {
+            a += column_ref.get_val(i);
+        }
+        a
+    });
+}
+
+#[bench]
+fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
+    let permutation = generate_permutation_gcd();
+    let n = permutation.len();
+    let column: Arc<dyn ColumnValues<u64>> = serialize_and_load(&permutation, CodecType::Bitpacked);
+    b.iter(|| {
+        let mut a = 0u64;
+        for i in 0..n {
+            a += column.get_val(i as u32);
+        }
+        a
+    });
+}
+
+#[bench]
+fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
+    let permutation = generate_permutation();
+    b.iter(|| {
+        let mut a = 0u64;
+        for i in 0..permutation.len() {
+            a += permutation[i as usize] as u64;
+        }
+        a
+    });
+}
--- a/columnar/columnar-cli/Cargo.toml
+++ b/columnar/columnar-cli/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "tantivy-columnar-cli"
+version = "0.1.0"
+edition = "2021"
+license = "MIT"
+
+[dependencies]
+columnar = {path="../", package="tantivy-columnar"}
+serde_json = "1"
+serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"}
+serde = "1"
+
+[workspace]
+members = []
+
+[profile.release]
+debug = true
--- a/columnar/columnar-cli/src/main.rs
+++ b/columnar/columnar-cli/src/main.rs
@@ -0,0 +1,134 @@
+use columnar::ColumnarWriter;
+use columnar::NumericalValue;
+use serde_json_borrow;
+use std::fs::File;
+use std::io;
+use std::io::BufRead;
+use std::io::BufReader;
+use std::time::Instant;
+
+#[derive(Default)]
+struct JsonStack {
+    path: String,
+    stack: Vec<usize>,
+}
+
+impl JsonStack {
+    fn push(&mut self, seg: &str) {
+        self.stack.push(self.path.len());
+        self.path.push('.');
+        self.path.push_str(seg);
+    }
+
+    fn pop(&mut self) {
+        if let Some(len) = self.stack.pop() {
+            self.path.truncate(len);
+        }
+    }
+
+    fn path(&self) -> &str {
+        // Skip the leading '.' added by `push`; a root-level value has an empty path (no panic).
+        self.path.get(1..).unwrap_or("")
+    }
+}
+
+fn append_json_to_columnar(
+    doc: u32,
+    json_value: &serde_json_borrow::Value,
+    columnar: &mut ColumnarWriter,
+    stack: &mut JsonStack,
+) -> usize {
+    let mut count = 0;
+    match json_value {
+        serde_json_borrow::Value::Null => {}
+        serde_json_borrow::Value::Bool(val) => {
+            columnar.record_numerical(
+                doc,
+                stack.path(),
+                NumericalValue::from(if *val { 1u64 } else { 0u64 }),
+            );
+            count += 1;
+        }
+        serde_json_borrow::Value::Number(num) => {
+            let numerical_value: NumericalValue = if let Some(num_i64) = num.as_i64() {
+                num_i64.into()
+            } else if let Some(num_u64) = num.as_u64() {
+                num_u64.into()
+            } else if let Some(num_f64) = num.as_f64() {
+                num_f64.into()
+            } else {
+                panic!();
+            };
+            count += 1;
+            columnar.record_numerical(
+                doc,
+                stack.path(),
+                numerical_value,
+            );
+        }
+        serde_json_borrow::Value::Str(msg) => {
+            columnar.record_str(
+                doc,
+                stack.path(),
+                msg,
+            );
+            count += 1;
+        },
+        serde_json_borrow::Value::Array(vals) => {
+            for val in vals {
+                count += append_json_to_columnar(doc, val, columnar, stack);
+            }
+        },
+        serde_json_borrow::Value::Object(json_map) => {
+            for (child_key, child_val) in json_map {
+                stack.push(child_key);
+                count += append_json_to_columnar(doc, child_val, columnar, stack);
+                stack.pop();
+            }
+        },
+    }
+    count
+}
+
+fn main() -> io::Result<()> {
+    let file = File::open("gh_small.json")?;
+    let mut reader = BufReader::new(file);
+    let mut line = String::with_capacity(100);
+    let mut columnar = columnar::ColumnarWriter::default();
+    let mut doc = 0;
+    let start = Instant::now();
+    let mut stack = JsonStack::default();
+    let mut total_count = 0;
+
+    let start_build = Instant::now();
+    loop {
+        line.clear();
+        let len = reader.read_line(&mut line)?;
+        if len == 0 {
+            break;
+        }
+        let Ok(json_value) = serde_json::from_str::<serde_json_borrow::Value>(&line) else { continue; };
+        total_count += append_json_to_columnar(doc, &json_value, &mut columnar, &mut stack);
+        doc += 1;
+    }
+    println!("Build in {:?}", start_build.elapsed());
+
+    println!("value count {total_count}");
+
+    let mut buffer = Vec::new();
+    let start_serialize = Instant::now();
+    columnar.serialize(doc, None, &mut buffer)?;
+    println!("Serialized in {:?}", start_serialize.elapsed());
+    println!("num docs: {doc}, {:?}", start.elapsed());
+    println!("buffer len {} MB", buffer.len() / 1_000_000);
+    let columnar = columnar::ColumnarReader::open(buffer)?;
+    for (column_name, dynamic_column) in columnar.list_columns()? {
+        let num_bytes = dynamic_column.num_bytes();
+        let typ = dynamic_column.column_type();
+        if num_bytes > 1_000_000 {
+            println!("{column_name} {typ:?}  {} KB", num_bytes / 1_000);
+        }
+    }
+    println!("{} columns", columnar.num_columns());
+    Ok(())
+}
--- a/columnar/src/TODO.md
+++ b/columnar/src/TODO.md
@@ -1,22 +1,21 @@
 # zero to one
-* merges
-* full still needs a num_values
-* replug u128
-* add dictionary encoded stuff
-* fix multivalued
-* find a way to make columnar work with strict types
-* plug to tantivy
-    - indexing
-    - aggregations
-    - merge
+
+* revisit line codec
+* add columns from schema on merge
+* Plugging JSON
+* replug examples
+* move datetime to quickwit common
+* switch to nanos
+* reintroduce the gcd map.

 # Perf and Size
+* remove alloc in `ord_to_term`
+ multivalued range queries restart from the beginning all of the time.
 * re-add ZSTD compression for dictionaries
 no systematic monotonic mapping
 consider removing multilinear
 f32?
 adhoc solution for bool?
-
 add metrics helper for aggregate. sum(row_id)
 review inline absence/presence
 improv perf of select using PDEP
@@ -36,11 +35,13 @@ use the rank & select naming in unit tests branch.
 multi-linear -> blockwise
 linear codec -> simply a multiplication for the index column
 rename columnar to something more explicit, like column_dictionary or columnar_table
+rename fastfield -> column
+document changes
+rationalization FastFieldValue, HasColumnType
+isolate u128_based and uniform naming

 # Other
 fix enhance column-cli

 # Santa claus
-
 autodetect datetime ipaddr, plug customizable tokenizer.
-
--- a/columnar/src/block_accessor.rs
+++ b/columnar/src/block_accessor.rs
@@ -0,0 +1,132 @@
+use std::cmp::Ordering;
+
+use crate::{Column, DocId, RowId};
+
+#[derive(Debug, Default, Clone)]
+pub struct ColumnBlockAccessor<T> {
+    val_cache: Vec<T>,
+    docid_cache: Vec<DocId>,
+    missing_docids_cache: Vec<DocId>,
+    row_id_cache: Vec<RowId>,
+}
+
+impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
+    ColumnBlockAccessor<T>
+{
+    #[inline]
+    pub fn fetch_block(&mut self, docs: &[u32], accessor: &Column<T>) {
+        self.docid_cache.clear();
+        self.row_id_cache.clear();
+        accessor.row_ids_for_docs(docs, &mut self.docid_cache, &mut self.row_id_cache);
+        self.val_cache.resize(self.row_id_cache.len(), T::default());
+        accessor
+            .values
+            .get_vals(&self.row_id_cache, &mut self.val_cache);
+    }
+    #[inline]
+    pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
+        self.fetch_block(docs, accessor);
+        // We can compare docid_cache with docs to find missing docs
+        if docs.len() != self.docid_cache.len() || accessor.index.is_multivalue() {
+            self.missing_docids_cache.clear();
+            find_missing_docs(docs, &self.docid_cache, |doc| {
+                self.missing_docids_cache.push(doc);
+                self.val_cache.push(missing);
+            });
+            self.docid_cache
+                .extend_from_slice(&self.missing_docids_cache);
+        }
+    }
+
+    #[inline]
+    pub fn iter_vals(&self) -> impl Iterator<Item = T> + '_ {
+        self.val_cache.iter().cloned()
+    }
+
+    #[inline]
+    pub fn iter_docid_vals(&self) -> impl Iterator<Item = (DocId, T)> + '_ {
+        self.docid_cache
+            .iter()
+            .cloned()
+            .zip(self.val_cache.iter().cloned())
+    }
+}
+
+/// Given two sorted lists of docids `docs` and `hits`, where `hits` is a subset of `docs`,
+/// invokes `callback` for every doc in `docs` that is not in `hits`.
+fn find_missing_docs<F>(docs: &[u32], hits: &[u32], mut callback: F)
+where F: FnMut(u32) {
+    let mut docs_iter = docs.iter();
+    let mut hits_iter = hits.iter();
+
+    let mut doc = docs_iter.next();
+    let mut hit = hits_iter.next();
+
+    while let (Some(&current_doc), Some(&current_hit)) = (doc, hit) {
+        match current_doc.cmp(&current_hit) {
+            Ordering::Less => {
+                callback(current_doc);
+                doc = docs_iter.next();
+            }
+            Ordering::Equal => {
+                doc = docs_iter.next();
+                hit = hits_iter.next();
+            }
+            Ordering::Greater => {
+                hit = hits_iter.next();
+            }
+        }
+    }
+
+    while let Some(&current_doc) = doc {
+        callback(current_doc);
+        doc = docs_iter.next();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_find_missing_docs() {
+        let docs: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+        let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
+
+        let mut missing_docs: Vec<u32> = Vec::new();
+
+        find_missing_docs(&docs, &hits, |missing_doc| {
+            missing_docs.push(missing_doc);
+        });
+
+        assert_eq!(missing_docs, vec![1, 3, 5, 7, 9]);
+    }
+
+    #[test]
+    fn test_find_missing_docs_empty() {
+        let docs: Vec<u32> = Vec::new();
+        let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
+
+        let mut missing_docs: Vec<u32> = Vec::new();
+
+        find_missing_docs(&docs, &hits, |missing_doc| {
+            missing_docs.push(missing_doc);
+        });
+
+        assert_eq!(missing_docs, vec![]);
+    }
+
+    #[test]
+    fn test_find_missing_docs_all_missing() {
+        let docs: Vec<u32> = vec![1, 2, 3, 4, 5];
+        let hits: Vec<u32> = Vec::new();
+
+        let mut missing_docs: Vec<u32> = Vec::new();
+
+        find_missing_docs(&docs, &hits, |missing_doc| {
+            missing_docs.push(missing_doc);
+        });
+
+        assert_eq!(missing_docs, vec![1, 2, 3, 4, 5]);
+    }
+}
--- a/columnar/src/column/dictionary_encoded.rs
+++ b/columnar/src/column/dictionary_encoded.rs
@@ -1,6 +1,6 @@
-use std::io;
 use std::ops::Deref;
 use std::sync::Arc;
+use std::{fmt, io};

 use sstable::{Dictionary, VoidSSTable};

@@ -21,7 +21,22 @@ pub struct BytesColumn {
    pub(crate) term_ord_column: Column<u64>,
 }

+impl fmt::Debug for BytesColumn {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("BytesColumn")
+            .field("term_ord_column", &self.term_ord_column)
+            .finish()
+    }
+}
+
 impl BytesColumn {
+    pub fn empty(num_docs: u32) -> BytesColumn {
+        BytesColumn {
+            dictionary: Arc::new(Dictionary::empty()),
+            term_ord_column: Column::build_empty_column(num_docs),
+        }
+    }
+
    /// Fills the given `output` buffer with the term associated to the ordinal `ord`.
    ///
    /// Returns `false` if the term does not exist (e.g. `term_ord` is greater or equal to the
@@ -32,30 +47,58 @@ impl BytesColumn {

    /// Returns the number of rows in the column.
    pub fn num_rows(&self) -> RowId {
-        self.term_ord_column.num_rows()
+        self.term_ord_column.num_docs()
+    }
+
+    pub fn term_ords(&self, row_id: RowId) -> impl Iterator<Item = u64> + '_ {
+        self.term_ord_column.values_for_doc(row_id)
    }

    /// Returns the column of ordinals
    pub fn ords(&self) -> &Column<u64> {
        &self.term_ord_column
    }
+
+    pub fn num_terms(&self) -> usize {
+        self.dictionary.num_terms()
+    }
+
+    pub fn dictionary(&self) -> &Dictionary<VoidSSTable> {
+        self.dictionary.as_ref()
+    }
 }

 #[derive(Clone)]
 pub struct StrColumn(BytesColumn);

-impl From<BytesColumn> for StrColumn {
-    fn from(bytes_col: BytesColumn) -> Self {
-        StrColumn(bytes_col)
+impl fmt::Debug for StrColumn {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{:?}", self.term_ord_column)
+    }
+}
+
+impl From<StrColumn> for BytesColumn {
+    fn from(str_column: StrColumn) -> BytesColumn {
+        str_column.0
    }
 }

 impl StrColumn {
+    pub fn wrap(bytes_column: BytesColumn) -> StrColumn {
+        StrColumn(bytes_column)
+    }
+
+    pub fn dictionary(&self) -> &Dictionary<VoidSSTable> {
+        self.0.dictionary.as_ref()
+    }
+
    /// Fills the buffer
    pub fn ord_to_str(&self, term_ord: u64, output: &mut String) -> io::Result<bool> {
        unsafe {
            let buf = output.as_mut_vec();
-            self.0.dictionary.ord_to_term(term_ord, buf)?;
+            if !self.0.dictionary.ord_to_term(term_ord, buf)? {
+                return Ok(false);
+            }
            // TODO consider remove checks if it hurts performance.
            if std::str::from_utf8(buf.as_slice()).is_err() {
                buf.clear();
--- a/columnar/src/column/mod.rs
+++ b/columnar/src/column/mod.rs
@@ -1,35 +1,76 @@
 mod dictionary_encoded;
 mod serialize;

-use std::ops::Deref;
+use std::fmt::{self, Debug};
+use std::io::Write;
+use std::ops::{Deref, Range, RangeInclusive};
 use std::sync::Arc;

 use common::BinarySerializable;
 pub use dictionary_encoded::{BytesColumn, StrColumn};
 pub use serialize::{
-    open_column_bytes, open_column_u128, open_column_u64, serialize_column_mappable_to_u128,
-    serialize_column_mappable_to_u64,
+    open_column_bytes, open_column_str, open_column_u128, open_column_u64,
+    serialize_column_mappable_to_u128, serialize_column_mappable_to_u64,
 };

 use crate::column_index::ColumnIndex;
-use crate::column_values::ColumnValues;
-use crate::{Cardinality, RowId};
+use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
+use crate::column_values::{monotonic_map_column, ColumnValues};
+use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};

 #[derive(Clone)]
-pub struct Column<T> {
-    pub idx: ColumnIndex<'static>,
+pub struct Column<T = u64> {
+    pub index: ColumnIndex,
    pub values: Arc<dyn ColumnValues<T>>,
 }

-impl<T: PartialOrd> Column<T> {
-    pub fn num_rows(&self) -> RowId {
-        match &self.idx {
-            ColumnIndex::Full => self.values.num_vals() as u32,
-            ColumnIndex::Optional(optional_index) => optional_index.num_rows(),
+impl<T: Debug + PartialOrd + Send + Sync + Copy + 'static> Debug for Column<T> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let num_docs = self.num_docs();
+        let entries = (0..num_docs)
+            .map(|i| (i, self.values_for_doc(i).collect::<Vec<_>>()))
+            .filter(|(_, vals)| !vals.is_empty());
+        f.debug_map().entries(entries).finish()
+    }
+}
+
+impl<T: PartialOrd + Default> Column<T> {
+    pub fn build_empty_column(num_docs: u32) -> Column<T> {
+        Column {
+            index: ColumnIndex::Empty { num_docs },
+            values: Arc::new(EmptyColumnValues),
+        }
+    }
+}
+
+impl<T: MonotonicallyMappableToU64> Column<T> {
+    pub fn to_u64_monotonic(self) -> Column<u64> {
+        let values = Arc::new(monotonic_map_column(
+            self.values,
+            StrictlyMonotonicMappingToInternal::<T>::new(),
+        ));
+        Column {
+            index: self.index,
+            values,
+        }
+    }
+}
+
+impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
+    #[inline]
+    pub fn get_cardinality(&self) -> Cardinality {
+        self.index.get_cardinality()
+    }
+
+    pub fn num_docs(&self) -> RowId {
+        match &self.index {
+            ColumnIndex::Empty { num_docs } => *num_docs,
+            ColumnIndex::Full => self.values.num_vals(),
+            ColumnIndex::Optional(optional_index) => optional_index.num_docs(),
            ColumnIndex::Multivalued(col_index) => {
                // The multivalued index contains all value start row_id,
                // and one extra value at the end with the overall number of rows.
-                col_index.num_vals() - 1
+                col_index.num_docs()
            }
        }
    }
@@ -37,21 +78,67 @@ impl<T: PartialOrd> Column<T> {
    pub fn min_value(&self) -> T {
        self.values.min_value()
    }
+
    pub fn max_value(&self) -> T {
        self.values.max_value()
    }
-}

-impl<T: PartialOrd + Copy + Send + Sync + 'static> Column<T> {
    pub fn first(&self, row_id: RowId) -> Option<T> {
-        self.values(row_id).next()
+        self.values_for_doc(row_id).next()
    }

-    pub fn values(&self, row_id: RowId) -> impl Iterator<Item = T> + '_ {
-        self.value_row_ids(row_id)
+    /// Translates a block of docids to row_ids.
+    ///
+    /// returns the row_ids and the matching docids on the same index
+    /// e.g.
+    /// DocId In:  [0, 5, 6]
+    /// DocId Out: [0, 0, 6, 6]
+    /// RowId Out: [0, 1, 2, 3]
+    #[inline]
+    pub fn row_ids_for_docs(
+        &self,
+        doc_ids: &[DocId],
+        doc_ids_out: &mut Vec<DocId>,
+        row_ids: &mut Vec<RowId>,
+    ) {
+        self.index.docids_to_rowids(doc_ids, doc_ids_out, row_ids)
+    }
+
+    pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
+        self.value_row_ids(doc_id)
            .map(|value_row_id: RowId| self.values.get_val(value_row_id))
    }

+    /// Get the docids of values which are in the provided value range.
+    #[inline]
+    pub fn get_docids_for_value_range(
+        &self,
+        value_range: RangeInclusive<T>,
+        selected_docid_range: Range<u32>,
+        doc_ids: &mut Vec<u32>,
+    ) {
+        // convert passed docid range to row id range
+        let rowid_range = self
+            .index
+            .docid_range_to_rowids(selected_docid_range.clone());
+
+        // Load rows
+        self.values
+            .get_row_ids_for_value_range(value_range, rowid_range, doc_ids);
+        // Convert rows to docids
+        self.index
+            .select_batch_in_place(selected_docid_range.start, doc_ids);
+    }
+
+    /// Fills the output vector with the (possibly multiple) values that are associated with
+    /// `row_id`.
+    ///
+    /// This method clears the `output` vector.
+    pub fn fill_vals(&self, row_id: RowId, output: &mut Vec<T>) {
+        output.clear();
+        output.extend(self.values_for_doc(row_id));
+    }
+
    pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
        Arc::new(FirstValueWithDefault {
            column: self,
@@ -61,15 +148,15 @@ impl<T: PartialOrd + Copy + Send + Sync + 'static> Column<T> {
 }

 impl<T> Deref for Column<T> {
-    type Target = ColumnIndex<'static>;
+    type Target = ColumnIndex;

    fn deref(&self) -> &Self::Target {
-        &self.idx
+        &self.index
    }
 }

 impl BinarySerializable for Cardinality {
-    fn serialize<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
+    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
        self.to_code().serialize(writer)
    }

@@ -86,7 +173,9 @@ struct FirstValueWithDefault<T: Copy> {
    default_value: T,
 }

-impl<T: PartialOrd + Send + Sync + Copy + 'static> ColumnValues<T> for FirstValueWithDefault<T> {
+impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T>
+    for FirstValueWithDefault<T>
+{
    fn get_val(&self, idx: u32) -> T {
        self.column.first(idx).unwrap_or(self.default_value)
    }
@@ -100,10 +189,11 @@ impl<T: PartialOrd + Send + Sync + Copy + 'static> ColumnValues<T> for FirstValu
    }

    fn num_vals(&self) -> u32 {
-        match &self.column.idx {
+        match &self.column.index {
+            ColumnIndex::Empty { .. } => 0u32,
            ColumnIndex::Full => self.column.values.num_vals(),
-            ColumnIndex::Optional(optional_idx) => optional_idx.num_rows(),
-            ColumnIndex::Multivalued(_) => todo!(),
+            ColumnIndex::Optional(optional_idx) => optional_idx.num_docs(),
+            ColumnIndex::Multivalued(multivalue_idx) => multivalue_idx.num_docs(),
        }
    }
 }
--- a/columnar/src/column/serialize.rs
+++ b/columnar/src/column/serialize.rs
@@ -7,44 +7,33 @@ use sstable::Dictionary;

 use crate::column::{BytesColumn, Column};
 use crate::column_index::{serialize_column_index, SerializableColumnIndex};
-use crate::column_values::serialize::serialize_column_values_u128;
 use crate::column_values::{
-    serialize_column_values, ColumnValues, FastFieldCodecType, MonotonicallyMappableToU128,
-    MonotonicallyMappableToU64,
+    load_u64_based_column_values, serialize_column_values_u128, serialize_u64_based_column_values,
+    CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
 };
+use crate::iterable::Iterable;
+use crate::StrColumn;

-pub fn serialize_column_mappable_to_u128<
-    F: Fn() -> I,
-    I: Iterator<Item = T>,
-    T: MonotonicallyMappableToU128,
->(
+pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
    column_index: SerializableColumnIndex<'_>,
-    column_values: F,
-    num_vals: u32,
+    iterable: &dyn Iterable<T>,
    output: &mut impl Write,
 ) -> io::Result<()> {
    let column_index_num_bytes = serialize_column_index(column_index, output)?;
-    serialize_column_values_u128(
-        || column_values().map(|val| val.to_u128()),
-        num_vals,
-        output,
-    )?;
+    serialize_column_values_u128(iterable, output)?;
    output.write_all(&column_index_num_bytes.to_le_bytes())?;
    Ok(())
 }

 pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
    column_index: SerializableColumnIndex<'_>,
-    column_values: &impl ColumnValues<T>,
+    column_values: &impl Iterable<T>,
    output: &mut impl Write,
 ) -> io::Result<()> {
    let column_index_num_bytes = serialize_column_index(column_index, output)?;
-    serialize_column_values(
+    serialize_u64_based_column_values(
        column_values,
-        &[
-            FastFieldCodecType::Bitpacked,
-            FastFieldCodecType::BlockwiseLinear,
-        ],
+        &[CodecType::Bitpacked, CodecType::BlockwiseLinear],
        output,
    )?;
    output.write_all(&column_index_num_bytes.to_le_bytes())?;
@@ -61,9 +50,9 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::
    );
    let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
    let column_index = crate::column_index::open_column_index(column_index_data)?;
-    let column_values = crate::column_values::open_u64_mapped(column_values_data)?;
+    let column_values = load_u64_based_column_values(column_values_data)?;
    Ok(Column {
-        idx: column_index,
+        index: column_index,
        values: column_values,
    })
 }
@@ -82,20 +71,24 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
    let column_index = crate::column_index::open_column_index(column_index_data)?;
    let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
    Ok(Column {
-        idx: column_index,
+        index: column_index,
        values: column_values,
    })
 }

-pub fn open_column_bytes<T: From<BytesColumn>>(data: OwnedBytes) -> io::Result<T> {
+pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
    let (body, dictionary_len_bytes) = data.rsplit(4);
    let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
    let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
    let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
    let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
-    let bytes_column = BytesColumn {
+    Ok(BytesColumn {
        dictionary,
        term_ord_column,
-    };
-    Ok(bytes_column.into())
+    })
+}
+
+pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
+    let bytes_column = open_column_bytes(data)?;
+    Ok(StrColumn::wrap(bytes_column))
 }
--- a/columnar/src/column_index/merge/mod.rs
+++ b/columnar/src/column_index/merge/mod.rs
@@ -0,0 +1,210 @@
+mod shuffled;
+mod stacked;
+
+use common::ReadOnlyBitSet;
+use shuffled::merge_column_index_shuffled;
+use stacked::merge_column_index_stacked;
+
+use crate::column_index::SerializableColumnIndex;
+use crate::{Cardinality, ColumnIndex, MergeRowOrder};
+
+fn detect_cardinality_single_column_index(
+    column_index: &ColumnIndex,
+    alive_bitset_opt: &Option<ReadOnlyBitSet>,
+) -> Cardinality {
+    let Some(alive_bitset) = alive_bitset_opt else {
+        return column_index.get_cardinality();
+    };
+    let cardinality_before_deletes = column_index.get_cardinality();
+    if cardinality_before_deletes == Cardinality::Full {
+        // The columnar cardinality can only become more restrictive in the presence of deletes
+        // (where cardinality sorted from the more restrictive to the least restrictive are Full,
+        // Optional, Multivalued)
+        //
+        // If we are already "Full", we are guaranteed to stay "Full" after deletes.
+        return Cardinality::Full;
+    }
+    let mut cardinality_so_far = Cardinality::Full;
+    for doc_id in alive_bitset.iter() {
+        let num_values = column_index.value_row_ids(doc_id).len();
+        let row_cardinality = match num_values {
+            0 => Cardinality::Optional,
+            1 => Cardinality::Full,
+            _ => Cardinality::Multivalued,
+        };
+        cardinality_so_far = cardinality_so_far.max(row_cardinality);
+        if cardinality_so_far >= cardinality_before_deletes {
+            // There won't be any improvement in the cardinality.
+            // We can early exit.
+            return cardinality_before_deletes;
+        }
+    }
+    cardinality_so_far
+}
+
+fn detect_cardinality(
+    column_indexes: &[ColumnIndex],
+    merge_row_order: &MergeRowOrder,
+) -> Cardinality {
+    match merge_row_order {
+        MergeRowOrder::Stack(_) => column_indexes
+            .iter()
+            .map(ColumnIndex::get_cardinality)
+            .max()
+            .unwrap_or(Cardinality::Full),
+        MergeRowOrder::Shuffled(shuffle_merge_order) => {
+            let mut merged_cardinality = Cardinality::Full;
+            for (column_index, alive_bitset_opt) in column_indexes
+                .iter()
+                .zip(shuffle_merge_order.alive_bitsets.iter())
+            {
+                let cardinality: Cardinality =
+                    detect_cardinality_single_column_index(column_index, alive_bitset_opt);
+                if cardinality == Cardinality::Multivalued {
+                    return cardinality;
+                }
+                merged_cardinality = merged_cardinality.max(cardinality);
+            }
+            merged_cardinality
+        }
+    }
+}
+
+pub fn merge_column_index<'a>(
+    columns: &'a [ColumnIndex],
+    merge_row_order: &'a MergeRowOrder,
+) -> SerializableColumnIndex<'a> {
+    // For simplification, we do not try to detect whether the cardinality could be
+    // downgraded thanks to deletes.
+    let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
+    match merge_row_order {
+        MergeRowOrder::Stack(stack_merge_order) => {
+            merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
+        }
+        MergeRowOrder::Shuffled(complex_merge_order) => {
+            merge_column_index_shuffled(columns, cardinality_after_merge, complex_merge_order)
+        }
+    }
+}
+
+// TODO actually, the shuffled code path is a bit too general.
+// In practice, we do not really shuffle everything.
+// The merge order restricted to a specific column keeps the original row order.
+//
+// This may offer some optimization that we have not explored yet.
+
+#[cfg(test)]
+mod tests {
+    use crate::column_index::merge::detect_cardinality;
+    use crate::column_index::multivalued_index::MultiValueIndex;
+    use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
+    use crate::{
+        Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
+    };
+
+    #[test]
+    fn test_detect_cardinality() {
+        assert_eq!(
+            detect_cardinality(&[], &StackMergeOrder::stack_for_test(&[]).into()),
+            Cardinality::Full
+        );
+        let optional_index: ColumnIndex = OptionalIndex::for_test(1, &[]).into();
+        let multivalued_index: ColumnIndex = MultiValueIndex::for_test(&[0, 1]).into();
+        assert_eq!(
+            detect_cardinality(
+                &[optional_index.clone(), ColumnIndex::Empty { num_docs: 0 }],
+                &StackMergeOrder::stack_for_test(&[1, 0]).into()
+            ),
+            Cardinality::Optional
+        );
+        assert_eq!(
+            detect_cardinality(
+                &[optional_index.clone(), ColumnIndex::Full],
+                &StackMergeOrder::stack_for_test(&[1, 1]).into()
+            ),
+            Cardinality::Optional
+        );
+        assert_eq!(
+            detect_cardinality(
+                &[
+                    multivalued_index.clone(),
+                    ColumnIndex::Empty { num_docs: 0 }
+                ],
+                &StackMergeOrder::stack_for_test(&[1, 0]).into()
+            ),
+            Cardinality::Multivalued
+        );
+        assert_eq!(
+            detect_cardinality(
+                &[multivalued_index.clone(), optional_index.clone()],
+                &StackMergeOrder::stack_for_test(&[1, 1]).into()
+            ),
+            Cardinality::Multivalued
+        );
+        assert_eq!(
+            detect_cardinality(
+                &[optional_index, multivalued_index],
+                &StackMergeOrder::stack_for_test(&[1, 1]).into()
+            ),
+            Cardinality::Multivalued
+        );
+    }
+
+    #[test]
+    fn test_merge_index_multivalued_sorted() {
+        let column_indexes: Vec<ColumnIndex> = vec![MultiValueIndex::for_test(&[0, 2, 5]).into()];
+        let merge_row_order: MergeRowOrder = ShuffleMergeOrder::for_test(
+            &[2],
+            vec![
+                RowAddr {
+                    segment_ord: 0u32,
+                    row_id: 1u32,
+                },
+                RowAddr {
+                    segment_ord: 0u32,
+                    row_id: 0u32,
+                },
+            ],
+        )
+        .into();
+        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
+        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+            panic!("Excpected a multivalued index")
+        };
+        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
+        assert_eq!(&start_indexes, &[0, 3, 5]);
+    }
+
+    #[test]
+    fn test_merge_index_multivalued_sorted_several_segment() {
+        let column_indexes: Vec<ColumnIndex> = vec![
+            MultiValueIndex::for_test(&[0, 2, 5]).into(),
+            ColumnIndex::Empty { num_docs: 0 },
+            MultiValueIndex::for_test(&[0, 1, 4]).into(),
+        ];
+        let merge_row_order: MergeRowOrder = ShuffleMergeOrder::for_test(
+            &[2, 0, 2],
+            vec![
+                RowAddr {
+                    segment_ord: 2u32,
+                    row_id: 1u32,
+                },
+                RowAddr {
+                    segment_ord: 0u32,
+                    row_id: 0u32,
+                },
+                RowAddr {
+                    segment_ord: 2u32,
+                    row_id: 0u32,
+                },
+            ],
+        )
+        .into();
+        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
+        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
+            panic!("Excpected a multivalued index")
+        };
+        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
+        assert_eq!(&start_indexes, &[0, 3, 5, 6]);
+    }
+}
--- a/columnar/src/column_index/merge/shuffled.rs
+++ b/columnar/src/column_index/merge/shuffled.rs
@@ -0,0 +1,171 @@
+use std::iter;
+
+use crate::column_index::{SerializableColumnIndex, Set};
+use crate::iterable::Iterable;
+use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};
+
+pub fn merge_column_index_shuffled<'a>(
+    column_indexes: &'a [ColumnIndex],
+    cardinality_after_merge: Cardinality,
+    shuffle_merge_order: &'a ShuffleMergeOrder,
+) -> SerializableColumnIndex<'a> {
+    match cardinality_after_merge {
+        Cardinality::Full => SerializableColumnIndex::Full,
+        Cardinality::Optional => {
+            let non_null_row_ids =
+                merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
+            SerializableColumnIndex::Optional {
+                non_null_row_ids,
+                num_rows: shuffle_merge_order.num_rows(),
+            }
+        }
+        Cardinality::Multivalued => {
+            let multivalue_start_index =
+                merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
+            SerializableColumnIndex::Multivalued(multivalue_start_index)
+        }
+    }
+}
+
+/// Merge several column indexes into one, ordering rows according to the merge_order passed as
+/// argument. While it is true that the `merge_order` may imply deletes and hence could in theory
+/// turn a multivalued index into an optional one, this is not supported today for simplification.
+///
+/// In other words the column_indexes passed as argument may NOT be multivalued.
+fn merge_column_index_shuffled_optional<'a>(
+    column_indexes: &'a [ColumnIndex],
+    merge_order: &'a ShuffleMergeOrder,
+) -> Box<dyn Iterable<RowId> + 'a> {
+    Box::new(ShuffledIndex {
+        column_indexes,
+        merge_order,
+    })
+}
+
+struct ShuffledIndex<'a> {
+    column_indexes: &'a [ColumnIndex],
+    merge_order: &'a ShuffleMergeOrder,
+}
+
+impl<'a> Iterable<u32> for ShuffledIndex<'a> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
+        Box::new(
+            self.merge_order
+                .iter_new_to_old_row_addrs()
+                .enumerate()
+                .filter_map(|(new_row_id, old_row_addr)| {
+                    let column_index = &self.column_indexes[old_row_addr.segment_ord as usize];
+                    let row_id = new_row_id as u32;
+                    if column_index.has_value(old_row_addr.row_id) {
+                        Some(row_id)
+                    } else {
+                        None
+                    }
+                }),
+        )
+    }
+}
+
+fn merge_column_index_shuffled_multivalued<'a>(
+    column_indexes: &'a [ColumnIndex],
+    merge_order: &'a ShuffleMergeOrder,
+) -> Box<dyn Iterable<RowId> + 'a> {
+    Box::new(ShuffledMultivaluedIndex {
+        column_indexes,
+        merge_order,
+    })
+}
+
+struct ShuffledMultivaluedIndex<'a> {
+    column_indexes: &'a [ColumnIndex],
+    merge_order: &'a ShuffleMergeOrder,
+}
+
+fn iter_num_values<'a>(
+    column_indexes: &'a [ColumnIndex],
+    merge_order: &'a ShuffleMergeOrder,
+) -> impl Iterator<Item = u32> + 'a {
+    merge_order.iter_new_to_old_row_addrs().map(|row_addr| {
+        let column_index = &column_indexes[row_addr.segment_ord as usize];
+        match column_index {
+            ColumnIndex::Empty { .. } => 0u32,
+            ColumnIndex::Full => 1,
+            ColumnIndex::Optional(optional_index) => {
+                u32::from(optional_index.contains(row_addr.row_id))
+            }
+            ColumnIndex::Multivalued(multivalued_index) => {
+                multivalued_index.range(row_addr.row_id).len() as u32
+            }
+        }
+    })
+}
+
+/// Transforms an iterator containing the number of vals per row (with `num_rows` elements)
+/// into a `start_offset` iterator starting at 0 (with `num_rows + 1` elements).
+fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item = RowId> {
+    iter::once(0u32).chain(num_vals.scan(0, |state, num_vals| {
+        *state += num_vals;
+        Some(*state)
+    }))
+}
+
+impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
+        let num_vals_per_row = iter_num_values(self.column_indexes, self.merge_order);
+        Box::new(integrate_num_vals(num_vals_per_row))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::column_index::OptionalIndex;
+    use crate::RowAddr;
+
+    #[test]
+    fn test_integrate_num_vals_empty() {
+        assert!(integrate_num_vals(iter::empty()).eq(iter::once(0)));
+    }
+
+    #[test]
+    fn test_integrate_num_vals_one_el() {
+        assert!(integrate_num_vals(iter::once(10)).eq([0, 10].into_iter()));
+    }
+
+    #[test]
+    fn test_integrate_num_vals_several() {
+        assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 3, 13, 33].into_iter()));
+    }
+
+    #[test]
+    fn test_merge_column_index_optional_shuffle() {
+        let optional_index: ColumnIndex = OptionalIndex::for_test(2, &[0]).into();
+        let column_indexes = vec![optional_index, ColumnIndex::Full];
+        let row_addrs = vec![
+            RowAddr {
+                segment_ord: 0u32,
+                row_id: 1u32,
+            },
+            RowAddr {
+                segment_ord: 1u32,
+                row_id: 0u32,
+            },
+        ];
+        let shuffle_merge_order = ShuffleMergeOrder::for_test(&[2, 1], row_addrs);
+        let serializable_index = merge_column_index_shuffled(
+            &column_indexes[..],
+            Cardinality::Optional,
+            &shuffle_merge_order,
+        );
+        let SerializableColumnIndex::Optional {
+            non_null_row_ids,
+            num_rows,
+        } = serializable_index
+        else {
+            panic!()
+        };
+        assert_eq!(num_rows, 2);
+        let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
+        assert_eq!(&non_null_rows, &[1]);
+    }
+}
--- a/columnar/src/column_index/merge/stacked.rs
+++ b/columnar/src/column_index/merge/stacked.rs
@@ -0,0 +1,151 @@
+use std::iter;
+
+use crate::column_index::{SerializableColumnIndex, Set};
+use crate::iterable::Iterable;
+use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
+
+/// Merges column indexes in the simple stacked case:
+/// the new mapping consists of the input column indexes stacked one after the other.
+///
+/// No sorting and no deletes are involved.
+pub fn merge_column_index_stacked<'a>(
+    columns: &'a [ColumnIndex],
+    cardinality_after_merge: Cardinality,
+    stack_merge_order: &'a StackMergeOrder,
+) -> SerializableColumnIndex<'a> {
+    match cardinality_after_merge {
+        Cardinality::Full => SerializableColumnIndex::Full, // unit variant: nothing to compute from the inputs
+        Cardinality::Optional => SerializableColumnIndex::Optional {
+            non_null_row_ids: Box::new(StackedOptionalIndex { // lazy iterable over the merged non-null row ids
+                columns,
+                stack_merge_order,
+            }),
+            num_rows: stack_merge_order.num_rows(), // presumably the total row count after stacking - TODO confirm
+        },
+        Cardinality::Multivalued => {
+            let stacked_multivalued_index = StackedMultivaluedIndex { // lazy iterable over the merged start offsets
+                columns,
+                stack_merge_order,
+            };
+            SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
+        }
+    }
+}
+
+struct StackedOptionalIndex<'a> { // lazy view over the non-null row ids of the stacked optional column
+    columns: &'a [ColumnIndex], // one column index per stacked columnar, indexed by columnar id
+    stack_merge_order: &'a StackMergeOrder, // queried through columnar_range for the output row range of each columnar
+}
+
+impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + 'a> { // yields the output row ids that hold a value, columnar after columnar
+        Box::new(
+            self.columns
+                .iter()
+                .enumerate()
+                .flat_map(|(columnar_id, column_index_opt)| {
+                    let columnar_row_range = self.stack_merge_order.columnar_range(columnar_id); // output row range assigned to this columnar
+                    let rows_it: Box<dyn Iterator<Item = RowId>> = match column_index_opt {
+                        ColumnIndex::Full => Box::new(columnar_row_range), // every row of the range has a value
+                        ColumnIndex::Optional(optional_index) => Box::new(
+                            optional_index
+                                .iter_rows()
+                                .map(move |row_id: RowId| columnar_row_range.start + row_id), // shift local row ids to output row ids
+                        ),
+                        ColumnIndex::Multivalued(_) => {
+                            panic!("No multivalued index is allowed when stacking column index"); // this type is only built for Cardinality::Optional merges
+                        }
+                        ColumnIndex::Empty { .. } => Box::new(std::iter::empty()), // no row has a value
+                    };
+                    rows_it
+                }),
+        )
+    }
+}
+
+#[derive(Clone, Copy)] // cheap to copy: both fields are shared references
+struct StackedMultivaluedIndex<'a> { // lazy view over the start offsets of the stacked multivalued column
+    columns: &'a [ColumnIndex], // one column index per stacked columnar, indexed by columnar id
+    stack_merge_order: &'a StackMergeOrder, // queried through columnar_range for the row count of each columnar
+}
+
+fn convert_column_opt_to_multivalued_index<'a>( // expresses any column index as a sequence of num_rows + 1 start offsets
+    column_index_opt: &'a ColumnIndex,
+    num_rows: RowId,
+) -> Box<dyn Iterator<Item = RowId> + 'a> {
+    match column_index_opt {
+        ColumnIndex::Empty { .. } => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)), // all zeros: every row has an empty value range
+        ColumnIndex::Full => Box::new(0..num_rows + 1), // row i starts at value i, the last entry is the total
+        ColumnIndex::Optional(optional_index) => {
+            Box::new(
+                (0..num_rows)
+                    // TODO optimize: rank is called once per row
+                    .map(|row_id| optional_index.rank(row_id)) // number of non-null rows before row_id
+                    .chain(std::iter::once(optional_index.num_non_nulls())), // final entry: total number of values
+            )
+        }
+        ColumnIndex::Multivalued(multivalued_index) => multivalued_index.start_index_column.iter(), // already stored as start offsets
+    }
+}
+
+impl<'a> Iterable<RowId> for StackedMultivaluedIndex<'a> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + '_> { // yields the start offsets of the stacked column
+        let multivalued_indexes =
+            self.columns
+                .iter()
+                .enumerate()
+                .map(|(columnar_id, column_opt)| {
+                    let num_rows =
+                        self.stack_merge_order.columnar_range(columnar_id).len() as RowId; // number of output rows of this columnar
+                    convert_column_opt_to_multivalued_index(column_opt, num_rows) // start offsets local to this columnar
+                });
+        stack_multivalued_indexes(multivalued_indexes) // rebase and chain the per-columnar offsets
+    }
+}
+
+// TODO: refactor. Chains several start-offset sequences into one, rebasing each on the last value emitted so far.
+fn stack_multivalued_indexes<'a>(
+    mut multivalued_indexes: impl Iterator<Item = Box<dyn Iterator<Item = RowId> + 'a>> + 'a,
+) -> Box<dyn Iterator<Item = RowId> + 'a> {
+    let mut offset = 0; // amount added to every value of the current sub-iterator
+    let mut last_row_id = 0; // last value emitted so far
+    let mut current_it = multivalued_indexes.next();
+    Box::new(std::iter::from_fn(move || loop {
+        let Some(multivalued_index) = current_it.as_mut() else {
+            return None; // no sub-iterator left
+        };
+        if let Some(row_id) = multivalued_index.next() {
+            last_row_id = offset + row_id;
+            return Some(last_row_id);
+        }
+        offset = last_row_id; // current sub-iterator is exhausted: the next one continues from the last emitted value
+        loop {
+            current_it = multivalued_indexes.next();
+            if current_it.as_mut()?.next().is_some() { // drops the first value of each following sub-iterator; the ? ends the iteration when none is left
+                break;
+            }
+        }
+    }))
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::RowId;
+
+    fn it<'a>(row_ids: &'a [RowId]) -> Box<dyn Iterator<Item = RowId> + 'a> { // boxes a slice as the iterator type expected by stack_multivalued_indexes
+        Box::new(row_ids.iter().copied())
+    }
+
+    #[test]
+    fn test_stack() {
+        let columns = [
+            it(&[0u32, 0u32]), // 1 row, 0 values
+            it(&[0u32, 1u32, 1u32, 4u32]), // 3 rows, 4 values
+            it(&[0u32, 3u32, 5u32]), // 2 rows, 5 values
+            it(&[0u32, 4u32]), // 1 row, 4 values
+        ]
+        .into_iter();
+        let start_offsets: Vec<RowId> = super::stack_multivalued_indexes(columns).collect();
+        assert_eq!(start_offsets, &[0, 0, 1, 1, 4, 7, 9, 13]); // 7 rows + 1 trailing total of 13 values
+    }
+}
--- a/columnar/src/column_index/mod.rs
+++ b/columnar/src/column_index/mod.rs
@@ -1,54 +1,180 @@
+mod merge;
 mod multivalued_index;
 mod optional_index;
 mod serialize;

 use std::ops::Range;
-use std::sync::Arc;

-pub use optional_index::{OptionalIndex, SerializableOptionalIndex, Set};
+pub use merge::merge_column_index;
+pub use optional_index::{OptionalIndex, Set};
 pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};

-use crate::column_values::ColumnValues;
-use crate::{Cardinality, RowId};
+use crate::column_index::multivalued_index::MultiValueIndex;
+use crate::{Cardinality, DocId, RowId};

-#[derive(Clone)]
-pub enum ColumnIndex<'a> {
+#[derive(Clone, Debug)]
+pub enum ColumnIndex {
+    Empty { // no doc has a value: has_value is false and value_row_ids is 0..0 for every doc
+        num_docs: u32, // only inspected by get_cardinality in this file (0 docs is reported as Full)
+    },
    Full,
    Optional(OptionalIndex),
-    // TODO Remove the static by fixing the codec if possible.
-    /// The column values enclosed contains for all row_id,
-    /// the value start_index.
-    ///
    /// In addition, at index num_rows, an extra value is added
    /// containing the overal number of values.
-    Multivalued(Arc<dyn ColumnValues<RowId> + 'a>),
+    Multivalued(MultiValueIndex), // wraps the start offset of the value range of each doc
 }

-impl<'a> ColumnIndex<'a> {
+impl From<OptionalIndex> for ColumnIndex {
+    fn from(optional_index: OptionalIndex) -> ColumnIndex { // plain wrapping, no validation
+        ColumnIndex::Optional(optional_index)
+    }
+}
+
+impl From<MultiValueIndex> for ColumnIndex {
+    fn from(multi_value_index: MultiValueIndex) -> ColumnIndex { // plain wrapping, no validation
+        ColumnIndex::Multivalued(multi_value_index)
+    }
+}
+
+impl ColumnIndex {
+    #[inline]
+    pub fn is_multivalue(&self) -> bool { // true only for the Multivalued variant
+        matches!(self, ColumnIndex::Multivalued(_))
+    }
+    /// Returns the cardinality of the column index.
+    ///
+    /// By convention, if the column contains no docs, we consider that it is
+    /// full.
+    #[inline]
    pub fn get_cardinality(&self) -> Cardinality {
        match self {
-            ColumnIndex::Full => Cardinality::Full,
+            ColumnIndex::Empty { num_docs: 0 } | ColumnIndex::Full => Cardinality::Full, // empty column over zero docs counts as full
+            ColumnIndex::Empty { .. } => Cardinality::Optional, // docs exist but none has a value
            ColumnIndex::Optional(_) => Cardinality::Optional,
            ColumnIndex::Multivalued(_) => Cardinality::Multivalued,
        }
    }

-    pub fn value_row_ids(&self, row_id: RowId) -> Range<RowId> {
+    /// Returns true if and only if there is at least one value associated with the doc.
+    pub fn has_value(&self, doc_id: DocId) -> bool {
        match self {
-            ColumnIndex::Full => row_id..row_id + 1,
+            ColumnIndex::Empty { .. } => false,
+            ColumnIndex::Full => true,
+            ColumnIndex::Optional(optional_index) => optional_index.contains(doc_id),
+            ColumnIndex::Multivalued(multivalued_index) => {
+                !multivalued_index.range(doc_id).is_empty()
+            }
+        }
+    }
+
+    pub fn value_row_ids(&self, doc_id: DocId) -> Range<RowId> { // range of value positions (row ids) owned by doc_id; empty when the doc has no value
+        match self {
+            ColumnIndex::Empty { .. } => 0..0,
+            ColumnIndex::Full => doc_id..doc_id + 1, // exactly one value, stored at the position of the doc
            ColumnIndex::Optional(optional_index) => {
-                if let Some(val) = optional_index.rank_if_exists(row_id) {
+                if let Some(val) = optional_index.rank_if_exists(doc_id) { // rank of the doc among the non-null docs, if it has a value
                    val..val + 1
                } else {
                    0..0
                }
            }
+            ColumnIndex::Multivalued(multivalued_index) => multivalued_index.range(doc_id),
+        }
+    }
+
+    /// Translates a block of doc ids to row ids.
+    ///
+    /// Appends every row id found to `row_ids`, and the doc id owning it to `doc_ids_out` at the same position,
+    /// e.g.
+    /// DocId In:  [0, 5, 6]
+    /// DocId Out: [0, 0, 6, 6]
+    /// RowId Out: [0, 1, 2, 3]
+    #[inline]
+    pub fn docids_to_rowids(
+        &self,
+        doc_ids: &[DocId],
+        doc_ids_out: &mut Vec<DocId>,
+        row_ids: &mut Vec<RowId>,
+    ) {
+        match self {
+            ColumnIndex::Empty { .. } => {} // no values: both outputs are left untouched
+            ColumnIndex::Full => {
+                doc_ids_out.extend_from_slice(doc_ids);
+                row_ids.extend_from_slice(doc_ids); // row id and doc id coincide for a full column
+            }
+            ColumnIndex::Optional(optional_index) => {
+                for doc_id in doc_ids {
+                    if let Some(row_id) = optional_index.rank_if_exists(*doc_id) { // docs without a value are skipped
+                        doc_ids_out.push(*doc_id);
+                        row_ids.push(row_id);
+                    }
+                }
+            }
            ColumnIndex::Multivalued(multivalued_index) => {
-                let multivalued_index_ref = &**multivalued_index;
-                let start: u32 = multivalued_index_ref.get_val(row_id);
-                let end: u32 = multivalued_index_ref.get_val(row_id + 1);
-                start..end
+                for doc_id in doc_ids {
+                    for row_id in multivalued_index.range(*doc_id) { // one output entry per value of the doc
+                        doc_ids_out.push(*doc_id);
+                        row_ids.push(row_id);
+                    }
+                }
+            }
+        }
+    }
+
+    pub fn docid_range_to_rowids(&self, doc_id: Range<DocId>) -> Range<RowId> { // maps a range of doc ids to the range of row ids holding their values
+        match self {
+            ColumnIndex::Empty { .. } => 0..0,
+            ColumnIndex::Full => doc_id,
+            ColumnIndex::Optional(optional_index) => {
+                let row_start = optional_index.rank(doc_id.start); // number of non-null docs before start
+                let row_end = optional_index.rank(doc_id.end); // number of non-null docs before end (end is exclusive here)
+                row_start..row_end
+            }
+            ColumnIndex::Multivalued(multivalued_index) => {
+                let end_docid = doc_id.end.min(multivalued_index.num_docs() - 1) + 1; // NOTE(review): after clamping, the values of doc `doc_id.end` are included, unlike the Full and Optional arms; num_docs() - 1 also underflows for an index without docs - confirm both are intended
+                let start_docid = doc_id.start.min(end_docid);
+
+                let row_start = multivalued_index.start_index_column.get_val(start_docid);
+                let row_end = multivalued_index.start_index_column.get_val(end_docid);
+
+                row_start..row_end
+            }
+        }
+    }
+
+    pub fn select_batch_in_place(&self, doc_id_start: DocId, rank_ids: &mut Vec<RowId>) { // replaces the value positions in rank_ids by the matching doc ids, in place
+        match self {
+            ColumnIndex::Empty { .. } => {
+                rank_ids.clear();
+            }
+            ColumnIndex::Full => {
+                // No need to do anything:
+                // value_idx and row_idx are the same.
+            }
+            ColumnIndex::Optional(optional_index) => {
+                optional_index.select_batch(&mut rank_ids[..]); // doc_id_start is not needed by this arm
+            }
+            ColumnIndex::Multivalued(multivalued_index) => {
+                multivalued_index.select_batch_in_place(doc_id_start, rank_ids)
            }
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::{Cardinality, ColumnIndex};
+
+    #[test]
+    fn test_column_index_get_cardinality() {
+        assert_eq!(
+            ColumnIndex::Empty { num_docs: 0 }.get_cardinality(), // zero docs: full by convention
+            Cardinality::Full
+        );
+        assert_eq!(ColumnIndex::Full.get_cardinality(), Cardinality::Full);
+        assert_eq!(
+            ColumnIndex::Empty { num_docs: 1 }.get_cardinality(), // a doc exists but has no value
+            Cardinality::Optional
+        );
+    }
+}
--- a/columnar/src/column_index/multivalued_index.rs
+++ b/columnar/src/column_index/multivalued_index.rs
@@ -1,29 +1,144 @@
 use std::io;
 use std::io::Write;
+use std::ops::Range;
 use std::sync::Arc;

 use common::OwnedBytes;

-use crate::column_values::{ColumnValues, FastFieldCodecType};
-use crate::RowId;
-
-#[derive(Clone)]
-pub struct MultivaluedIndex(Arc<dyn ColumnValues<RowId>>);
+use crate::column_values::{
+    load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
+};
+use crate::iterable::Iterable;
+use crate::{DocId, RowId};

 pub fn serialize_multivalued_index(
-    multivalued_index: &dyn ColumnValues<RowId>,
+    multivalued_index: &dyn Iterable<RowId>, // the start offsets to write
    output: &mut impl Write,
 ) -> io::Result<()> {
-    crate::column_values::serialize_column_values(
-        &*multivalued_index,
-        &[FastFieldCodecType::Bitpacked, FastFieldCodecType::Linear],
+    serialize_u64_based_column_values(
+        multivalued_index,
+        &[CodecType::Bitpacked, CodecType::Linear], // presumably the codecs the serializer may pick from - TODO confirm
        output,
    )?;
    Ok(())
 }

-pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnValues<RowId>>> {
-    let start_index_column: Arc<dyn ColumnValues<RowId>> =
-        crate::column_values::open_u64_mapped(bytes)?;
-    Ok(start_index_column)
+pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> { // counterpart of serialize_multivalued_index
+    let start_index_column: Arc<dyn ColumnValues<RowId>> = load_u64_based_column_values(bytes)?; // loading errors are propagated
+    Ok(MultiValueIndex { start_index_column })
+}
+
+#[derive(Clone)]
+/// Index to resolve the range of values of a given doc_id.
+/// Starts at 0. Doc `d` owns the values `start_index_column[d]..start_index_column[d + 1]` (see `range`).
+pub struct MultiValueIndex {
+    pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>, // holds num_docs() + 1 entries
+}
+
+impl std::fmt::Debug for MultiValueIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.debug_struct("MultiValuedIndex") // NOTE(review): printed name differs from the type name MultiValueIndex
+            .field("num_rows", &self.start_index_column.num_vals()) // NOTE(review): this is the number of stored offsets, i.e. num_docs() + 1
+            .finish_non_exhaustive() // the column content itself is not printed
+    }
+}
+
+impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
+    fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self { // plain wrapping, the column is not validated
+        MultiValueIndex { start_index_column }
+    }
+}
+
+impl MultiValueIndex {
+    pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex { // builds an index by serializing the offsets in memory and opening the result
+        let mut buffer = Vec::new();
+        serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
+        let bytes = OwnedBytes::new(buffer);
+        open_multivalued_index(bytes).unwrap()
+    }
+
+    /// Returns `[start, end)`, such that the values associated with
+    /// the given document are `start..end`.
+    #[inline]
+    pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
+        let start = self.start_index_column.get_val(doc_id);
+        let end = self.start_index_column.get_val(doc_id + 1); // the next entry marks the end of this doc
+        start..end
+    }
+
+    /// Returns the number of documents in the index.
+    #[inline]
+    pub fn num_docs(&self) -> u32 {
+        self.start_index_column.num_vals() - 1 // one extra trailing entry is stored; NOTE(review): underflows if the column holds no entry
+    }
+
+    /// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
+    /// docids. Positions are converted in place to docids; a doc hit several times in a row is written once.
+    ///
+    /// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
+    /// index.
+    ///
+    /// Correctness: positions need to be sorted. The index (`start_index_column`) needs to contain
+    /// monotonically increasing positions.
+    ///
+    /// TODO: Instead of a linear scan we can employ an exponential search into binary search to
+    /// match a docid to its value position.
+    #[allow(clippy::bool_to_int_with_if)]
+    pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
+        if ranks.is_empty() {
+            return;
+        }
+        let mut cur_doc = docid_start; // the scan only moves forward from here
+        let mut last_doc = None; // last doc written, used to skip consecutive duplicates
+
+        assert!(self.start_index_column.get_val(docid_start) <= ranks[0]); // the first position must not lie before docid_start
+
+        let mut write_doc_pos = 0; // next output slot; never ahead of the read index i
+        for i in 0..ranks.len() {
+            let pos = ranks[i];
+            loop {
+                let end = self.start_index_column.get_val(cur_doc + 1);
+                if end > pos { // pos falls inside the value range of cur_doc
+                    ranks[write_doc_pos] = cur_doc;
+                    write_doc_pos += if last_doc == Some(cur_doc) { 0 } else { 1 }; // keep the slot only for a new doc
+                    last_doc = Some(cur_doc);
+                    break;
+                }
+                cur_doc += 1;
+            }
+        }
+        ranks.truncate(write_doc_pos); // drop the unused tail
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ops::Range;
+
+    use super::MultiValueIndex;
+
+    fn index_to_pos_helper( // runs select_batch_in_place on a copy of positions and returns the resulting doc ids
+        index: &MultiValueIndex,
+        doc_id_range: Range<u32>,
+        positions: &[u32],
+    ) -> Vec<u32> {
+        let mut positions = positions.to_vec();
+        index.select_batch_in_place(doc_id_range.start, &mut positions); // only the start of the range is used
+        positions
+    }
+
+    #[test]
+    fn test_positions_to_docid() {
+        let index = MultiValueIndex::for_test(&[0, 10, 12, 15, 22, 23]); // 5 docs holding 10, 2, 3, 7 and 1 values
+        assert_eq!(index.num_docs(), 5);
+        let positions = &[10u32, 11, 15, 20, 21, 22];
+        assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]); // docs hit: 1, 1, 3, 3, 3, 4 before deduplication
+        assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
+        assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
+        assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
+        assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
+        assert_eq!(index_to_pos_helper(&index, 2..5, &[12]), vec![2]);
+        assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
+        assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
+    }
 }
--- a/columnar/src/column_index/optional_index/mod.rs
+++ b/columnar/src/column_index/optional_index/mod.rs
@@ -1,17 +1,17 @@
 use std::io::{self, Write};
-use std::ops::Range;
 use std::sync::Arc;

 mod set;
 mod set_block;

-use common::{BinarySerializable, GroupByIteratorExtended, OwnedBytes, VInt};
-pub use set::{Set, SetCodec};
+use common::{BinarySerializable, OwnedBytes, VInt};
+pub use set::{SelectCursor, Set, SetCodec};
 use set_block::{
    DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
 };

-use crate::{InvalidData, RowId};
+use crate::iterable::Iterable;
+use crate::{DocId, InvalidData, RowId};

 /// The threshold for for number of elements after which we switch to dense block encoding.
 ///
@@ -88,13 +88,12 @@ pub struct OptionalIndex {
    block_metas: Arc<[BlockMeta]>,
 }

-impl OptionalIndex {
-    pub fn num_rows(&self) -> RowId {
-        self.num_rows
-    }
-
-    pub fn num_non_nulls(&self) -> RowId {
-        self.num_non_null_rows
+impl std::fmt::Debug for OptionalIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("OptionalIndex")
+            .field("num_rows", &self.num_rows)
+            .field("num_non_null_rows", &self.num_non_null_rows)
+            .finish_non_exhaustive() // only the two counters are printed, the remaining fields are elided
    }
 }

@@ -115,7 +114,63 @@ fn row_addr_from_row_id(row_id: RowId) -> RowAddr {
    }
 }

+enum BlockSelectCursor<'a> { // select cursor over a single block, one variant per block encoding
+    Dense(<DenseBlock<'a> as Set<u16>>::SelectCursor<'a>),
+    Sparse(<SparseBlock<'a> as Set<u16>>::SelectCursor<'a>),
+}
+
+impl<'a> BlockSelectCursor<'a> {
+    fn select(&mut self, rank: u16) -> u16 { // forwards to the cursor of the underlying block; rank is relative to the block
+        match self {
+            BlockSelectCursor::Dense(dense_select_cursor) => dense_select_cursor.select(rank),
+            BlockSelectCursor::Sparse(sparse_select_cursor) => sparse_select_cursor.select(rank),
+        }
+    }
+}
+pub struct OptionalIndexSelectCursor<'a> { // select cursor over a whole OptionalIndex, caching the block it last loaded
+    current_block_cursor: BlockSelectCursor<'a>,
+    current_block_id: u16,
+    // Every rank < current_block_end_rank that is requested next is served by the current block.
+    current_block_end_rank: RowId,
+    optional_index: &'a OptionalIndex,
+    block_doc_idx_start: RowId, // row id of the first row of the current block
+    num_null_rows_before_block: RowId, // NOTE(review): despite the name, this is assigned the number of NON-null rows before the block
+}
+
+impl<'a> OptionalIndexSelectCursor<'a> {
+    fn search_and_load_block(&mut self, rank: RowId) { // makes the block containing rank the current block
+        if rank < self.current_block_end_rank {
+            // we are already in the right block
+            return;
+        }
+        self.current_block_id = self.optional_index.find_block(rank, self.current_block_id); // the search starts at the current block and never goes backward
+        self.current_block_end_rank = self
+            .optional_index
+            .block_metas
+            .get(self.current_block_id as usize + 1)
+            .map(|block_meta| block_meta.non_null_rows_before_block) // first rank served by the next block
+            .unwrap_or(u32::MAX); // last block: no upper bound
+        self.block_doc_idx_start = (self.current_block_id as u32) * ELEMENTS_PER_BLOCK;
+        let block_meta = self.optional_index.block_metas[self.current_block_id as usize];
+        self.num_null_rows_before_block = block_meta.non_null_rows_before_block; // rank of the first non-null row of this block
+        let block: Block<'_> = self.optional_index.block(block_meta);
+        self.current_block_cursor = match block { // fresh cursor positioned at the start of the block
+            Block::Dense(dense_block) => BlockSelectCursor::Dense(dense_block.select_cursor()),
+            Block::Sparse(sparse_block) => BlockSelectCursor::Sparse(sparse_block.select_cursor()),
+        };
+    }
+}
+
+impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
+    fn select(&mut self, rank: RowId) -> RowId { // returns the row id of the non-null row of the given rank
+        self.search_and_load_block(rank);
+        let index_in_block = (rank - self.num_null_rows_before_block) as u16; // rank relative to the current block
+        self.current_block_cursor.select(index_in_block) as RowId + self.block_doc_idx_start // block-local row id shifted by the block start
+    }
+}
+
 impl Set<RowId> for OptionalIndex {
+    type SelectCursor<'b> = OptionalIndexSelectCursor<'b> where Self: 'b;
    // Check if value at position is not null.
    #[inline]
    fn contains(&self, row_id: RowId) -> bool {
@@ -131,11 +186,26 @@ impl Set<RowId> for OptionalIndex {
    }

    #[inline]
-    fn rank_if_exists(&self, row_id: RowId) -> Option<RowId> {
+    fn rank(&self, doc_id: DocId) -> RowId { // number of non-null rows located before doc_id
        let RowAddr {
            block_id,
            in_block_row_id,
-        } = row_addr_from_row_id(row_id);
+        } = row_addr_from_row_id(doc_id);
+        let block_meta = self.block_metas[block_id as usize]; // panics if block_id is past the last block
+        let block = self.block(block_meta);
+        let block_offset_row_id = match block { // rank inside the block
+            Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
+            Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),
+        } as u32;
+        block_meta.non_null_rows_before_block + block_offset_row_id // plus the non-null rows of all previous blocks
+    }
+
+    #[inline]
+    fn rank_if_exists(&self, doc_id: DocId) -> Option<RowId> {
+        let RowAddr {
+            block_id,
+            in_block_row_id,
+        } = row_addr_from_row_id(doc_id);
        let block_meta = self.block_metas[block_id as usize];
        let block = self.block(block_meta);
        let block_offset_row_id = match block {
@@ -148,7 +218,7 @@ impl Set<RowId> for OptionalIndex {
    #[inline]
    fn select(&self, rank: RowId) -> RowId {
        let block_pos = self.find_block(rank, 0);
-        let block_doc_idx_start = block_pos * ELEMENTS_PER_BLOCK;
+        let block_doc_idx_start = (block_pos as u32) * ELEMENTS_PER_BLOCK;
        let block_meta = self.block_metas[block_pos as usize];
        let block: Block<'_> = self.block(block_meta);
        let index_in_block = (rank - block_meta.non_null_rows_before_block) as u16;
@@ -159,41 +229,55 @@ impl Set<RowId> for OptionalIndex {
        block_doc_idx_start + in_block_rank as u32
    }

-    fn select_batch(&self, ranks: &[u32], output_idxs: &mut [u32]) {
-        let mut block_pos = 0u32;
-        let mut start = 0;
-        let group_by_it = ranks.iter().copied().group_by(move |codec_idx| {
-            block_pos = self.find_block(*codec_idx, block_pos);
-            block_pos
-        });
-        for (block_pos, block_iter) in group_by_it {
-            let block_doc_idx_start = block_pos * ELEMENTS_PER_BLOCK;
-            let block_meta = self.block_metas[block_pos as usize];
-            let block: Block<'_> = self.block(block_meta);
-            let offset = block_meta.non_null_rows_before_block;
-            let indexes_in_block_iter =
-                block_iter.map(move |codec_idx| (codec_idx - offset) as u16);
-            match block {
-                Block::Dense(dense_block) => {
-                    for in_offset in dense_block.select_iter(indexes_in_block_iter) {
-                        output_idxs[start] = in_offset as u32 + block_doc_idx_start;
-                        start += 1;
-                    }
-                }
-                Block::Sparse(sparse_block) => {
-                    for in_offset in sparse_block.select_iter(indexes_in_block_iter) {
-                        output_idxs[start] = in_offset as u32 + block_doc_idx_start;
-                        start += 1;
-                    }
-                }
-            };
+    fn select_cursor(&self) -> OptionalIndexSelectCursor<'_> { // builds a cursor that has not loaded any block yet
+        OptionalIndexSelectCursor {
+            current_block_cursor: BlockSelectCursor::Sparse(
+                SparseBlockCodec::open(b"").select_cursor(), // placeholder over empty data, replaced by the first block load
+            ),
+            current_block_id: 0u16,
+            current_block_end_rank: 0u32, // no rank is < 0, so the first select call always loads a block
+            optional_index: self,
+            block_doc_idx_start: 0u32,
+            num_null_rows_before_block: 0u32,
        }
    }
 }

 impl OptionalIndex {
+    pub fn for_test(num_rows: RowId, row_ids: &[RowId]) -> OptionalIndex { // builds an index over num_rows rows whose non-null rows are row_ids
+        assert!(row_ids
+            .last()
+            .copied()
+            .map(|last_row_id| last_row_id < num_rows) // only the last id is checked; presumably row_ids is expected to be sorted
+            .unwrap_or(true)); // an empty row_ids is accepted
+        let mut buffer = Vec::new();
+        serialize_optional_index(&row_ids, num_rows, &mut buffer).unwrap();
+        let bytes = OwnedBytes::new(buffer);
+        open_optional_index(bytes).unwrap() // round trip through the serialized form
+    }
+
+    pub fn num_docs(&self) -> RowId { // total number of rows, null or not
+        self.num_rows
+    }
+
+    pub fn num_non_nulls(&self) -> RowId { // number of rows holding a value
+        self.num_non_null_rows
+    }
+
+    pub fn iter_rows(&self) -> impl Iterator<Item = RowId> + '_ { // yields the row id of every non-null row, by increasing rank
+        // TODO optimize: one select call per non-null row
+        let mut select_batch = self.select_cursor();
+        (0..self.num_non_null_rows).map(move |rank| select_batch.select(rank))
+    }
+    pub fn select_batch(&self, ranks: &mut [RowId]) { // replaces each rank by the matching row id, in place
+        let mut select_cursor = self.select_cursor(); // a single cursor is shared, so ranks must be increasing (see SelectCursor)
+        for rank in ranks.iter_mut() {
+            *rank = select_cursor.select(*rank);
+        }
+    }
+
    #[inline]
-    fn block<'a>(&'a self, block_meta: BlockMeta) -> Block<'a> {
+    fn block(&self, block_meta: BlockMeta) -> Block<'_> {
        let BlockMeta {
            start_byte_offset,
            block_variant,
@@ -214,14 +298,14 @@ impl OptionalIndex {
    }

    #[inline]
-    fn find_block(&self, dense_idx: u32, start_block_pos: u32) -> u32 {
-        for block_pos in start_block_pos..self.block_metas.len() as u32 {
+    fn find_block(&self, dense_idx: u32, start_block_pos: u16) -> u16 { // position of the block holding rank dense_idx, scanning forward from start_block_pos
+        for block_pos in start_block_pos..self.block_metas.len() as u16 { // NOTE(review): the cast truncates if there are more than u16::MAX blocks - confirm this cannot happen
            let offset = self.block_metas[block_pos as usize].non_null_rows_before_block;
            if offset > dense_idx {
-                return block_pos - 1;
+                return block_pos - 1u16; // the previous block is the last one starting at or before dense_idx
            }
        }
-        self.block_metas.len() as u32 - 1u32
+        self.block_metas.len() as u16 - 1u16 // no later block starts past dense_idx: it belongs to the last block
    }

    // TODO Add a good API for the codec_idx to original_idx translation.
@@ -255,7 +339,7 @@ impl OptionalIndexCodec {
 }

 impl BinarySerializable for OptionalIndexCodec {
-    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> { // writes the code of the codec as a single byte
        writer.write_all(&[self.to_code()])
    }

@@ -276,13 +360,14 @@ fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -
    Ok(())
 }

-pub fn serialize_optional_index<'a, W: io::Write>(
-    serializable_optional_index: &dyn SerializableOptionalIndex<'a>,
+pub fn serialize_optional_index<W: io::Write>(
+    non_null_rows: &dyn Iterable<RowId>,
+    num_rows: RowId,
    output: &mut W,
 ) -> io::Result<()> {
-    VInt(serializable_optional_index.num_rows() as u64).serialize(output)?;
+    VInt(num_rows as u64).serialize(output)?;

-    let mut rows_it = serializable_optional_index.non_null_rows();
+    let mut rows_it = non_null_rows.boxed_iter();
    let mut block_metadata: Vec<SerializedBlockMeta> = Vec::new();
    let mut current_block = Vec::new();

@@ -351,7 +436,7 @@ impl SerializedBlockMeta {
    }

    #[inline]
-    fn to_bytes(&self) -> [u8; SERIALIZED_BLOCK_META_NUM_BYTES] {
+    fn to_bytes(self) -> [u8; SERIALIZED_BLOCK_META_NUM_BYTES] {
        assert!(self.num_non_null_rows > 0);
        let mut bytes = [0u8; SERIALIZED_BLOCK_META_NUM_BYTES];
        bytes[0..2].copy_from_slice(&self.block_id.to_le_bytes());
@@ -364,7 +449,7 @@ impl SerializedBlockMeta {

 #[inline]
 fn is_sparse(num_rows_in_block: u32) -> bool {
-    num_rows_in_block < DENSE_BLOCK_THRESHOLD as u32
+    num_rows_in_block < DENSE_BLOCK_THRESHOLD // strictly below the threshold counts as sparse
 }

 fn deserialize_optional_index_block_metadatas(
@@ -372,7 +457,7 @@ fn deserialize_optional_index_block_metadatas(
    num_rows: u32,
 ) -> (Box<[BlockMeta]>, u32) {
    let num_blocks = data.len() / SERIALIZED_BLOCK_META_NUM_BYTES;
-    let mut block_metas = Vec::with_capacity(num_blocks as usize + 1);
+    let mut block_metas = Vec::with_capacity(num_blocks + 1);
    let mut start_byte_offset = 0;
    let mut non_null_rows_before_block = 0;
    for block_meta_bytes in data.chunks_exact(SERIALIZED_BLOCK_META_NUM_BYTES) {
@@ -403,7 +488,7 @@ fn deserialize_optional_index_block_metadatas(
            block_variant,
        });
        start_byte_offset += block_variant.num_bytes_in_block();
-        non_null_rows_before_block += num_non_null_rows as u32;
+        non_null_rows_before_block += num_non_null_rows;
    }
    block_metas.resize(
        ((num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE) as usize,
@@ -425,7 +510,7 @@ pub fn open_optional_index(bytes: OwnedBytes) -> io::Result<OptionalIndex> {
        num_non_empty_block_bytes as usize * SERIALIZED_BLOCK_META_NUM_BYTES;
    let (block_data, block_metas) = bytes.rsplit(block_metas_num_bytes);
    let (block_metas, num_non_null_rows) =
-        deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows).into();
+        deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows);
    let optional_index = OptionalIndex {
        num_rows,
        num_non_null_rows,
@@ -435,19 +520,5 @@ pub fn open_optional_index(bytes: OwnedBytes) -> io::Result<OptionalIndex> {
    Ok(optional_index)
 }

-pub trait SerializableOptionalIndex<'a> {
-    fn num_rows(&self) -> RowId;
-    fn non_null_rows(&self) -> Box<dyn Iterator<Item = RowId> + 'a>;
-}
-
-impl SerializableOptionalIndex<'static> for Range<u32> {
-    fn num_rows(&self) -> RowId {
-        self.end
-    }
-    fn non_null_rows(&self) -> Box<dyn Iterator<Item = RowId> + 'static> {
-        Box::new(self.clone())
-    }
-}
-
 #[cfg(test)]
 mod tests;
--- a/columnar/src/column_index/optional_index/set.rs
+++ b/columnar/src/column_index/optional_index/set.rs
@@ -10,14 +10,28 @@ pub trait SetCodec {
    ///
    /// May panic if the elements are not sorted.
    fn serialize(els: impl Iterator<Item = Self::Item>, wrt: impl io::Write) -> io::Result<()>;
-    fn open<'a>(data: &'a [u8]) -> Self::Reader<'a>;
+    fn open(data: &[u8]) -> Self::Reader<'_>;
+}
+
+/// Stateful object that makes it possible to compute several selects in a row,
+/// provided the ranks passed as arguments are increasing.
+pub trait SelectCursor<T> {
+    // May panic if rank is greater than the number of elements in the Set,
+    // or if rank is less than the value provided in the previous call.
+    fn select(&mut self, rank: T) -> T;
 }

 pub trait Set<T> {
+    type SelectCursor<'b>: SelectCursor<T>
+    where Self: 'b;
+
    /// Returns true if the elements is contained in the Set
    fn contains(&self, el: T) -> bool;

-    /// If the set contains `el` returns its position in the sortd set of elements.
+    /// Returns the number of rows in the set that are < `el`
+    fn rank(&self, el: T) -> T;
+
+    /// If the set contains `el`, returns its rank.
    /// If the set does not contain the element, it returns `None`.
    fn rank_if_exists(&self, el: T) -> Option<T>;

@@ -28,11 +42,6 @@ pub trait Set<T> {
    /// May panic if rank is greater than the number of elements in the Set.
    fn select(&self, rank: T) -> T;

-    /// Batch version of select.
-    /// `ranks` is assumed to be sorted.
-    ///
-    /// # Panics
-    ///
-    /// May panic if rank is greater than the number of elements in the Set.
-    fn select_batch(&self, ranks: &[T], outputs: &mut [T]);
+    /// Creates a brand new select cursor.
+    fn select_cursor(&self) -> Self::SelectCursor<'_>;
 }
--- a/columnar/src/column_index/optional_index/set_block/dense.rs
+++ b/columnar/src/column_index/optional_index/set_block/dense.rs
@@ -3,7 +3,7 @@ use std::io::{self, Write};

 use common::BinarySerializable;

-use crate::column_index::optional_index::{Set, SetCodec, ELEMENTS_PER_BLOCK};
+use crate::column_index::optional_index::{SelectCursor, Set, SetCodec, ELEMENTS_PER_BLOCK};

 #[inline(always)]
 fn get_bit_at(input: u64, n: u16) -> bool {
@@ -32,7 +32,7 @@ pub const MINI_BLOCK_NUM_BYTES: usize = MINI_BLOCK_BITVEC_NUM_BYTES + MINI_BLOCK

 /// Number of bytes in a dense block.
 pub const DENSE_BLOCK_NUM_BYTES: u32 =
-    (ELEMENTS_PER_BLOCK as u32 / ELEMENTS_PER_MINI_BLOCK as u32) * MINI_BLOCK_NUM_BYTES as u32;
+    (ELEMENTS_PER_BLOCK / ELEMENTS_PER_MINI_BLOCK as u32) * MINI_BLOCK_NUM_BYTES as u32;

 pub struct DenseBlockCodec;

@@ -45,7 +45,7 @@ impl SetCodec for DenseBlockCodec {
    }

    #[inline]
-    fn open<'a>(data: &'a [u8]) -> Self::Reader<'a> {
+    fn open(data: &[u8]) -> Self::Reader<'_> {
        assert_eq!(data.len(), DENSE_BLOCK_NUM_BYTES as usize);
        DenseBlock(data)
    }
@@ -94,7 +94,7 @@ impl DenseMiniBlock {
        Self { bitvec, rank }
    }

-    fn to_bytes(&self) -> [u8; MINI_BLOCK_NUM_BYTES] {
+    fn to_bytes(self) -> [u8; MINI_BLOCK_NUM_BYTES] {
        let mut bytes = [0u8; MINI_BLOCK_NUM_BYTES];
        bytes[..MINI_BLOCK_BITVEC_NUM_BYTES].copy_from_slice(&self.bitvec.to_le_bytes());
        bytes[MINI_BLOCK_BITVEC_NUM_BYTES..].copy_from_slice(&self.rank.to_le_bytes());
@@ -105,7 +105,27 @@ impl DenseMiniBlock {
 #[derive(Copy, Clone)]
 pub struct DenseBlock<'a>(&'a [u8]);

+pub struct DenseBlockSelectCursor<'a> {
+    block_id: u16,
+    dense_block: DenseBlock<'a>,
+}
+
+impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
+    #[inline]
+    fn select(&mut self, rank: u16) -> u16 {
+        self.block_id = self
+            .dense_block
+            .find_miniblock_containing_rank(rank, self.block_id)
+            .unwrap();
+        let index_block = self.dense_block.mini_block(self.block_id);
+        let in_block_rank = rank - index_block.rank;
+        self.block_id * ELEMENTS_PER_MINI_BLOCK + select_u64(index_block.bitvec, in_block_rank)
+    }
+}
+
 impl<'a> Set<u16> for DenseBlock<'a> {
+    type SelectCursor<'b> = DenseBlockSelectCursor<'a> where Self: 'b;
+
    #[inline(always)]
    fn contains(&self, el: u16) -> bool {
        let mini_block_id = el / ELEMENTS_PER_MINI_BLOCK;
@@ -128,6 +148,15 @@ impl<'a> Set<u16> for DenseBlock<'a> {
        }
    }

+    #[inline(always)]
+    fn rank(&self, el: u16) -> u16 {
+        let block_pos = el / ELEMENTS_PER_MINI_BLOCK;
+        let index_block = self.mini_block(block_pos);
+        let pos_in_block_bit_vec = el % ELEMENTS_PER_MINI_BLOCK;
+        let ones_in_block = rank_u64(index_block.bitvec, pos_in_block_bit_vec);
+        index_block.rank + ones_in_block
+    }
+
    #[inline(always)]
    fn select(&self, rank: u16) -> u16 {
        let block_id = self.find_miniblock_containing_rank(rank, 0).unwrap();
@@ -136,37 +165,15 @@ impl<'a> Set<u16> for DenseBlock<'a> {
        block_id * ELEMENTS_PER_MINI_BLOCK + select_u64(index_block.bitvec, in_block_rank)
    }

-    fn select_batch(&self, ranks: &[u16], outputs: &mut [u16]) {
-        let orig_ids = self.select_iter(ranks.iter().copied());
-        for (output, original_id) in outputs.iter_mut().zip(orig_ids) {
-            *output = original_id;
+    #[inline(always)]
+    fn select_cursor(&self) -> Self::SelectCursor<'_> {
+        DenseBlockSelectCursor {
+            block_id: 0,
+            dense_block: *self,
        }
    }
 }

-impl<'a> DenseBlock<'a> {
-    /// Iterator verison of select.
-    ///
-    /// # Panics
-    /// Panics if one of the rank is higher than the number of elements in the set.
-    pub fn select_iter<'b>(
-        &self,
-        rank_it: impl Iterator<Item = u16> + 'b,
-    ) -> impl Iterator<Item = u16> + 'b
-    where
-        Self: 'b,
-    {
-        let mut block_id = 0u16;
-        let me = *self;
-        rank_it.map(move |rank| {
-            block_id = me.find_miniblock_containing_rank(rank, block_id).unwrap();
-            let index_block = me.mini_block(block_id);
-            let in_block_rank = rank - index_block.rank;
-            block_id * ELEMENTS_PER_MINI_BLOCK + select_u64(index_block.bitvec, in_block_rank)
-        })
-    }
-}
-
 impl<'a> DenseBlock<'a> {
    #[inline]
    fn mini_block(&self, mini_block_id: u16) -> DenseMiniBlock {
@@ -222,7 +229,7 @@ pub fn serialize_dense_codec(
        while block_id > current_block_id {
            let dense_mini_block = DenseMiniBlock {
                bitvec: block,
-                rank: non_null_rows_before as u16,
+                rank: non_null_rows_before,
            };
            output.write_all(&dense_mini_block.to_bytes())?;
            non_null_rows_before += block.count_ones() as u16;
--- a/columnar/src/column_index/optional_index/set_block/sparse.rs
+++ b/columnar/src/column_index/optional_index/set_block/sparse.rs
@@ -1,4 +1,4 @@
-use crate::column_index::optional_index::{Set, SetCodec};
+use crate::column_index::optional_index::{SelectCursor, Set, SetCodec};

 pub struct SparseBlockCodec;

@@ -16,7 +16,7 @@ impl SetCodec for SparseBlockCodec {
        Ok(())
    }

-    fn open<'a>(data: &'a [u8]) -> Self::Reader<'a> {
+    fn open(data: &[u8]) -> Self::Reader<'_> {
        SparseBlock(data)
    }
 }
@@ -24,7 +24,16 @@ impl SetCodec for SparseBlockCodec {
 #[derive(Copy, Clone)]
 pub struct SparseBlock<'a>(&'a [u8]);

+impl<'a> SelectCursor<u16> for SparseBlock<'a> {
+    #[inline]
+    fn select(&mut self, rank: u16) -> u16 {
+        <SparseBlock<'a> as Set<u16>>::select(self, rank)
+    }
+}
+
 impl<'a> Set<u16> for SparseBlock<'a> {
+    type SelectCursor<'b> = Self where Self: 'b;
+
    #[inline(always)]
    fn contains(&self, el: u16) -> bool {
        self.binary_search(el).is_ok()
@@ -35,17 +44,20 @@ impl<'a> Set<u16> for SparseBlock<'a> {
        self.binary_search(el).ok()
    }

+    #[inline(always)]
+    fn rank(&self, el: u16) -> u16 {
+        self.binary_search(el).unwrap_or_else(|el| el)
+    }
+
    #[inline(always)]
    fn select(&self, rank: u16) -> u16 {
        let offset = rank as usize * 2;
        u16::from_le_bytes(self.0[offset..offset + 2].try_into().unwrap())
    }

-    fn select_batch(&self, ranks: &[u16], outputs: &mut [u16]) {
-        let orig_ids = self.select_iter(ranks.iter().copied());
-        for (output, original_id) in outputs.iter_mut().zip(orig_ids) {
-            *output = original_id;
-        }
+    #[inline(always)]
+    fn select_cursor(&self) -> Self::SelectCursor<'_> {
+        *self
    }
 }

@@ -96,17 +108,4 @@ impl<'a> SparseBlock<'a> {
        }
        Err(left)
    }
-
-    pub fn select_iter<'b>(
-        &self,
-        iter: impl Iterator<Item = u16> + 'b,
-    ) -> impl Iterator<Item = u16> + 'b
-    where
-        Self: 'b,
-    {
-        iter.map(|codec_id| {
-            let offset = codec_id as usize * 2;
-            u16::from_le_bytes(self.0[offset..offset + 2].try_into().unwrap())
-        })
-    }
 }
--- a/columnar/src/column_index/optional_index/set_block/tests.rs
+++ b/columnar/src/column_index/optional_index/set_block/tests.rs
@@ -1,9 +1,8 @@
 use std::collections::HashMap;

-use crate::column_index::optional_index::set_block::{
-    DenseBlockCodec, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
-};
-use crate::column_index::optional_index::{Set, SetCodec};
+use crate::column_index::optional_index::set_block::dense::DENSE_BLOCK_NUM_BYTES;
+use crate::column_index::optional_index::set_block::{DenseBlockCodec, SparseBlockCodec};
+use crate::column_index::optional_index::{SelectCursor, Set, SetCodec};

 fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
    let mut buffer = Vec::new();
@@ -18,6 +17,10 @@ fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
    for val in 0u16..=u16::MAX {
        assert_eq!(tested_set.contains(val), hash_set.contains_key(&val));
        assert_eq!(tested_set.rank_if_exists(val), hash_set.get(&val).copied());
+        assert_eq!(
+            tested_set.rank(val),
+            vals.iter().cloned().take_while(|v| *v < val).count() as u16
+        );
    }
    for rank in 0..vals.len() {
        assert_eq!(tested_set.select(rank as u16), vals[rank]);
@@ -75,12 +78,10 @@ fn test_simple_translate_codec_codec_idx_to_original_idx_dense() {
        .unwrap();
    let tested_set = DenseBlockCodec::open(buffer.as_slice());
    assert!(tested_set.contains(1));
-    assert_eq!(
-        &tested_set
-            .select_iter([0, 1, 2, 5].iter().copied())
-            .collect::<Vec<u16>>(),
-        &[1, 3, 17, 30_001]
-    );
+    let mut select_cursor = tested_set.select_cursor();
+    assert_eq!(select_cursor.select(0), 1);
+    assert_eq!(select_cursor.select(1), 3);
+    assert_eq!(select_cursor.select(2), 17);
 }

 #[test]
@@ -89,12 +90,10 @@ fn test_simple_translate_codec_idx_to_original_idx_sparse() {
    SparseBlockCodec::serialize([1, 3, 17].iter().copied(), &mut buffer).unwrap();
    let tested_set = SparseBlockCodec::open(buffer.as_slice());
    assert!(tested_set.contains(1));
-    assert_eq!(
-        &tested_set
-            .select_iter([0, 1, 2].iter().copied())
-            .collect::<Vec<u16>>(),
-        &[1, 3, 17]
-    );
+    let mut select_cursor = tested_set.select_cursor();
+    assert_eq!(SelectCursor::select(&mut select_cursor, 0), 1);
+    assert_eq!(SelectCursor::select(&mut select_cursor, 1), 3);
+    assert_eq!(SelectCursor::select(&mut select_cursor, 2), 17);
 }

 #[test]
@@ -103,10 +102,8 @@ fn test_simple_translate_codec_idx_to_original_idx_dense() {
    DenseBlockCodec::serialize(0u16..150u16, &mut buffer).unwrap();
    let tested_set = DenseBlockCodec::open(buffer.as_slice());
    assert!(tested_set.contains(1));
-    let rg = 0u16..150u16;
-    let els: Vec<u16> = rg.clone().collect();
-    assert_eq!(
-        &tested_set.select_iter(rg.clone()).collect::<Vec<u16>>(),
-        &els
-    );
+    let mut select_cursor = tested_set.select_cursor();
+    for i in 0..150 {
+        assert_eq!(i, select_cursor.select(i));
+    }
 }
--- a/columnar/src/column_index/optional_index/tests.rs
+++ b/columnar/src/column_index/optional_index/tests.rs
@@ -37,13 +37,14 @@ proptest! {
 fn test_with_random_sets_simple() {
    let vals = 10..BLOCK_SIZE * 2;
    let mut out: Vec<u8> = Vec::new();
-    serialize_optional_index(&vals.clone(), &mut out).unwrap();
+    serialize_optional_index(&vals, 100, &mut out).unwrap();
    let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
    let ranks: Vec<u32> = (65_472u32..65_473u32).collect();
    let els: Vec<u32> = ranks.iter().copied().map(|rank| rank + 10).collect();
-    let mut output = vec![0u32; ranks.len()];
-    null_index.select_batch(&ranks[..], &mut output[..]);
-    assert_eq!(&output, &els);
+    let mut select_cursor = null_index.select_cursor();
+    for (rank, el) in ranks.iter().copied().zip(els.iter().copied()) {
+        assert_eq!(select_cursor.select(rank), el);
+    }
 }

 #[test]
@@ -65,12 +66,8 @@ fn test_optional_index_one_block_true() {
    test_null_index(&iter[..]);
 }

-impl<'a> SerializableOptionalIndex<'a> for &'a [bool] {
-    fn num_rows(&self) -> RowId {
-        self.len() as u32
-    }
-
-    fn non_null_rows(&self) -> Box<dyn Iterator<Item = RowId> + 'a> {
+impl<'a> Iterable<RowId> for &'a [bool] {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + 'a> {
        Box::new(
            self.iter()
                .cloned()
@@ -83,7 +80,7 @@ impl<'a> SerializableOptionalIndex<'a> for &'a [bool] {

 fn test_null_index(data: &[bool]) {
    let mut out: Vec<u8> = Vec::new();
-    serialize_optional_index(&data, &mut out).unwrap();
+    serialize_optional_index(&data, data.len() as RowId, &mut out).unwrap();
    let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
    let orig_idx_with_value: Vec<u32> = data
        .iter()
@@ -91,11 +88,10 @@ fn test_null_index(data: &[bool]) {
        .filter(|(_pos, val)| **val)
        .map(|(pos, _val)| pos as u32)
        .collect();
-    let ids: Vec<u32> = (0..orig_idx_with_value.len() as u32).collect();
-    let mut output = vec![0u32; ids.len()];
-    null_index.select_batch(&ids[..], &mut output);
-    // assert_eq!(&output[0..100], &orig_idx_with_value[0..100]);
-    assert_eq!(output, orig_idx_with_value);
+    let mut select_iter = null_index.select_cursor();
+    for i in 0..orig_idx_with_value.len() {
+        assert_eq!(select_iter.select(i as u32), orig_idx_with_value[i]);
+    }

    let step_size = (orig_idx_with_value.len() / 100).max(1);
    for (dense_idx, orig_idx) in orig_idx_with_value.iter().enumerate().step_by(step_size) {
@@ -111,51 +107,96 @@ fn test_null_index(data: &[bool]) {

 #[test]
 fn test_optional_index_test_translation() {
-    let mut out = vec![];
-    let iter = &[true, false, true, false];
-    serialize_optional_index(&&iter[..], &mut out).unwrap();
-    let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
-    let mut output = vec![0u32; 2];
-    null_index.select_batch(&[0, 1], &mut output);
-    assert_eq!(output, &[0, 2]);
+    let optional_index = OptionalIndex::for_test(4, &[0, 2]);
+    let mut select_cursor = optional_index.select_cursor();
+    assert_eq!(select_cursor.select(0), 0);
+    assert_eq!(select_cursor.select(1), 2);
 }

 #[test]
 fn test_optional_index_translate() {
-    let mut out = vec![];
-    let iter = &[true, false, true, false];
-    serialize_optional_index(&&iter[..], &mut out).unwrap();
-    let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
-    assert_eq!(null_index.rank_if_exists(0), Some(0));
-    assert_eq!(null_index.rank_if_exists(2), Some(1));
+    let optional_index = OptionalIndex::for_test(4, &[0, 2]);
+    assert_eq!(optional_index.rank_if_exists(0), Some(0));
+    assert_eq!(optional_index.rank_if_exists(2), Some(1));
 }

 #[test]
 fn test_optional_index_small() {
-    let mut out = vec![];
-    let iter = &[true, false, true, false];
-    serialize_optional_index(&&iter[..], &mut out).unwrap();
-    let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
-    assert!(null_index.contains(0));
-    assert!(!null_index.contains(1));
-    assert!(null_index.contains(2));
-    assert!(!null_index.contains(3));
+    let optional_index = OptionalIndex::for_test(4, &[0, 2]);
+    assert!(optional_index.contains(0));
+    assert!(!optional_index.contains(1));
+    assert!(optional_index.contains(2));
+    assert!(!optional_index.contains(3));
 }

 #[test]
 fn test_optional_index_large() {
-    let mut docs = vec![];
-    docs.extend((0..ELEMENTS_PER_BLOCK).map(|_idx| false));
-    docs.extend((0..=1).map(|_idx| true));
+    let row_ids = &[ELEMENTS_PER_BLOCK, ELEMENTS_PER_BLOCK + 1];
+    let optional_index = OptionalIndex::for_test(ELEMENTS_PER_BLOCK + 2, row_ids);
+    assert!(!optional_index.contains(0));
+    assert!(!optional_index.contains(100));
+    assert!(!optional_index.contains(ELEMENTS_PER_BLOCK - 1));
+    assert!(optional_index.contains(ELEMENTS_PER_BLOCK));
+    assert!(optional_index.contains(ELEMENTS_PER_BLOCK + 1));
+}

-    let mut out = vec![];
-    serialize_optional_index(&&docs[..], &mut out).unwrap();
-    let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
-    assert!(!null_index.contains(0));
-    assert!(!null_index.contains(100));
-    assert!(!null_index.contains(ELEMENTS_PER_BLOCK - 1));
-    assert!(null_index.contains(ELEMENTS_PER_BLOCK));
-    assert!(null_index.contains(ELEMENTS_PER_BLOCK + 1));
+fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
+    let optional_index = OptionalIndex::for_test(num_rows, row_ids);
+    assert_eq!(optional_index.num_docs(), num_rows);
+    assert!(optional_index.iter_rows().eq(row_ids.iter().copied()));
+}
+
+#[test]
+fn test_optional_index_iter_empty() {
+    test_optional_index_iter_aux(&[], 0u32);
+}
+
+fn test_optional_index_rank_aux(row_ids: &[RowId]) {
+    let num_rows = row_ids.last().copied().unwrap_or(0u32) + 1;
+    let null_index = OptionalIndex::for_test(num_rows, row_ids);
+    assert_eq!(null_index.num_docs(), num_rows);
+    for (row_id, row_val) in row_ids.iter().copied().enumerate() {
+        assert_eq!(null_index.rank(row_val), row_id as u32);
+        assert_eq!(null_index.rank_if_exists(row_val), Some(row_id as u32));
+        if row_val > 0 && !null_index.contains(&row_val - 1) {
+            assert_eq!(null_index.rank(row_val - 1), row_id as u32);
+        }
+        assert_eq!(null_index.rank(row_val + 1), row_id as u32 + 1);
+    }
+}
+
+#[test]
+fn test_optional_index_rank() {
+    test_optional_index_rank_aux(&[1u32]);
+    test_optional_index_rank_aux(&[0u32, 1u32]);
+    let mut block = Vec::new();
+    block.push(3u32);
+    block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
+    test_optional_index_rank_aux(&block);
+}
+
+#[test]
+fn test_optional_index_iter_empty_one() {
+    test_optional_index_iter_aux(&[1], 2u32);
+    test_optional_index_iter_aux(&[100_000], 200_000u32);
+}
+
+#[test]
+fn test_optional_index_iter_dense_block() {
+    let mut block = Vec::new();
+    block.push(3u32);
+    block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
+    test_optional_index_iter_aux(&block, 3 * BLOCK_SIZE);
+}
+
+#[test]
+fn test_optional_index_for_tests() {
+    let optional_index = OptionalIndex::for_test(4, &[1, 2]);
+    assert!(!optional_index.contains(0));
+    assert!(optional_index.contains(1));
+    assert!(optional_index.contains(2));
+    assert!(!optional_index.contains(3));
+    assert_eq!(optional_index.num_docs(), 4);
 }

 #[cfg(all(test, feature = "unstable"))]
@@ -171,11 +212,13 @@ mod bench {
    fn gen_bools(fill_ratio: f64) -> OptionalIndex {
        let mut out = Vec::new();
        let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
-        let vals: Vec<bool> = (0..TOTAL_NUM_VALUES)
+        let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
            .map(|_| rng.gen_bool(fill_ratio))
+            .enumerate()
+            .filter(|(pos, val)| *val)
+            .map(|(pos, _)| pos as RowId)
            .collect();
-        serialize_optional_index(&&vals[..], &mut out).unwrap();
-
+        serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
        let codec = open_optional_index(OwnedBytes::new(out)).unwrap();
        codec
    }
@@ -311,7 +354,8 @@ mod bench {
        };
        let mut output = vec![0u32; idxs.len()];
        bench.iter(|| {
-            codec.select_batch(&idxs[..], &mut output);
+            output.copy_from_slice(&idxs[..]);
+            codec.select_batch(&mut output);
        });
    }

--- a/columnar/src/column_index/serialize.rs
+++ b/columnar/src/column_index/serialize.rs
@@ -5,23 +5,26 @@ use common::{CountingWriter, OwnedBytes};

 use crate::column_index::multivalued_index::serialize_multivalued_index;
 use crate::column_index::optional_index::serialize_optional_index;
-use crate::column_index::{ColumnIndex, SerializableOptionalIndex};
-use crate::column_values::ColumnValues;
+use crate::column_index::ColumnIndex;
+use crate::iterable::Iterable;
 use crate::{Cardinality, RowId};

 pub enum SerializableColumnIndex<'a> {
    Full,
-    Optional(Box<dyn SerializableOptionalIndex<'a> + 'a>),
+    Optional {
+        non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
+        num_rows: RowId,
+    },
    // TODO remove the Arc<dyn> apart from serialization this is not
    // dynamic at all.
-    Multivalued(Box<dyn ColumnValues<RowId> + 'a>),
+    Multivalued(Box<dyn Iterable<RowId> + 'a>),
 }

 impl<'a> SerializableColumnIndex<'a> {
    pub fn get_cardinality(&self) -> Cardinality {
        match self {
            SerializableColumnIndex::Full => Cardinality::Full,
-            SerializableColumnIndex::Optional(_) => Cardinality::Optional,
+            SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
            SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
        }
    }
@@ -36,9 +39,10 @@ pub fn serialize_column_index(
    output.write_all(&[cardinality])?;
    match column_index {
        SerializableColumnIndex::Full => {}
-        SerializableColumnIndex::Optional(optional_index) => {
-            serialize_optional_index(&*optional_index, &mut output)?
-        }
+        SerializableColumnIndex::Optional {
+            non_null_row_ids,
+            num_rows,
+        } => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
        SerializableColumnIndex::Multivalued(multivalued_index) => {
            serialize_multivalued_index(&*multivalued_index, &mut output)?
        }
@@ -47,7 +51,7 @@ pub fn serialize_column_index(
    Ok(column_index_num_bytes)
 }

-pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex<'static>> {
+pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
    if bytes.is_empty() {
        return Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
@@ -64,8 +68,8 @@ pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex<'stati
            Ok(ColumnIndex::Optional(optional_index))
        }
        Cardinality::Multivalued => {
-            let multivalued_index = super::multivalued_index::open_multivalued_index(bytes)?;
-            Ok(ColumnIndex::Multivalued(multivalued_index))
+            let multivalue_index = super::multivalued_index::open_multivalued_index(bytes)?;
+            Ok(ColumnIndex::Multivalued(multivalue_index))
        }
    }
 }
--- a/columnar/src/column_values/bench.rs
+++ b/columnar/src/column_values/bench.rs
@@ -0,0 +1,135 @@
+use std::sync::Arc;
+
+use common::OwnedBytes;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use test::{self, Bencher};
+
+use super::*;
+use crate::column_values::u64_based::*;
+
+fn get_data() -> Vec<u64> {
+    let mut rng = StdRng::seed_from_u64(2u64);
+    let mut data: Vec<_> = (100..55000_u64)
+        .map(|num| num + rng.gen::<u8>() as u64)
+        .collect();
+    data.push(99_000);
+    data.insert(1000, 2000);
+    data.insert(2000, 100);
+    data.insert(3000, 4100);
+    data.insert(4000, 100);
+    data.insert(5000, 800);
+    data
+}
+
+fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
+    let mut stats_collector = StatsCollector::default();
+    for val in vals {
+        stats_collector.collect(val);
+    }
+    stats_collector.stats()
+}
+
+#[inline(never)]
+fn value_iter() -> impl Iterator<Item = u64> {
+    0..20_000
+}
+fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
+    let mut bytes = Vec::new();
+    let stats = compute_stats(data.iter().cloned());
+    let mut codec_serializer = Codec::estimator();
+    for val in data {
+        codec_serializer.collect(*val);
+    }
+    codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
+
+    Codec::load(OwnedBytes::new(bytes)).unwrap()
+}
+fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
+    let col = get_reader_for_bench::<Codec>(data);
+    b.iter(|| {
+        let mut sum = 0u64;
+        for pos in value_iter() {
+            let val = col.get_val(pos as u32);
+            sum = sum.wrapping_add(val);
+        }
+        sum
+    });
+}
+
+#[inline(never)]
+fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn ColumnValues>) {
+    b.iter(|| {
+        let mut sum = 0u64;
+        for pos in value_iter() {
+            let val = col.get_val(pos as u32);
+            sum = sum.wrapping_add(val);
+        }
+        sum
+    });
+}
+
+fn bench_get_dynamic<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
+    let col = Arc::new(get_reader_for_bench::<Codec>(data));
+    bench_get_dynamic_helper(b, col);
+}
+fn bench_create<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
+    let stats = compute_stats(data.iter().cloned());
+
+    let mut bytes = Vec::new();
+    b.iter(|| {
+        bytes.clear();
+        let mut codec_serializer = Codec::estimator();
+        for val in data.iter().take(1024) {
+            codec_serializer.collect(*val);
+        }
+
+        codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
+    });
+}
+
+#[bench]
+fn bench_fastfield_bitpack_create(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_create::<BitpackedCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_create::<LinearCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_create::<BlockwiseLinearCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_bitpack_get(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_get::<BitpackedCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_get_dynamic::<BitpackedCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_get::<LinearCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_get_dynamic::<LinearCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_get::<BlockwiseLinearCodec>(b, &data);
+}
+#[bench]
+fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
+    let data: Vec<_> = get_data();
+    bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
+}
--- a/columnar/src/column_values/bitpacked.rs
+++ b/columnar/src/column_values/bitpacked.rs
@@ -1,115 +0,0 @@
-use std::io::{self, Write};
-
-use common::OwnedBytes;
-use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
-
-use super::serialize::NormalizedHeader;
-use super::{ColumnValues, FastFieldCodec, FastFieldCodecType};
-
-/// Depending on the field type, a different
-/// fast field is required.
-#[derive(Clone)]
-pub struct BitpackedReader {
-    data: OwnedBytes,
-    bit_unpacker: BitUnpacker,
-    normalized_header: NormalizedHeader,
-}
-
-impl ColumnValues for BitpackedReader {
-    #[inline]
-    fn get_val(&self, doc: u32) -> u64 {
-        self.bit_unpacker.get(doc, &self.data)
-    }
-    #[inline]
-    fn min_value(&self) -> u64 {
-        // The BitpackedReader assumes a normalized vector.
-        0
-    }
-    #[inline]
-    fn max_value(&self) -> u64 {
-        self.normalized_header.max_value
-    }
-    #[inline]
-    fn num_vals(&self) -> u32 {
-        self.normalized_header.num_vals
-    }
-}
-
-pub struct BitpackedCodec;
-
-impl FastFieldCodec for BitpackedCodec {
-    /// The CODEC_TYPE is an enum value used for serialization.
-    const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;
-
-    type Reader = BitpackedReader;
-
-    /// Opens a fast field given a file.
-    fn open_from_bytes(
-        data: OwnedBytes,
-        normalized_header: NormalizedHeader,
-    ) -> io::Result<Self::Reader> {
-        let num_bits = compute_num_bits(normalized_header.max_value);
-        let bit_unpacker = BitUnpacker::new(num_bits);
-        Ok(BitpackedReader {
-            data,
-            bit_unpacker,
-            normalized_header,
-        })
-    }
-
-    /// Serializes data with the BitpackedFastFieldSerializer.
-    ///
-    /// The bitpacker assumes that the column has been normalized.
-    /// i.e. It has already been shifted by its minimum value, so that its
-    /// current minimum value is 0.
-    ///
-    /// Ideally, we made a shift upstream on the column so that `col.min_value() == 0`.
-    fn serialize(column: &dyn ColumnValues, write: &mut impl Write) -> io::Result<()> {
-        assert_eq!(column.min_value(), 0u64);
-        let num_bits = compute_num_bits(column.max_value());
-        let mut bit_packer = BitPacker::new();
-        for val in column.iter() {
-            bit_packer.write(val, num_bits, write)?;
-        }
-        bit_packer.close(write)?;
-        Ok(())
-    }
-
-    fn estimate(column: &dyn ColumnValues) -> Option<f32> {
-        let num_bits = compute_num_bits(column.max_value());
-        let num_bits_uncompressed = 64;
-        Some(num_bits as f32 / num_bits_uncompressed as f32)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::column_values::tests::create_and_validate;
-
-    fn create_and_validate_bitpacked_codec(data: &[u64], name: &str) {
-        create_and_validate::<BitpackedCodec>(data, name);
-    }
-
-    #[test]
-    fn test_with_codec_data_sets() {
-        let data_sets = crate::column_values::tests::get_codec_test_datasets();
-        for (mut data, name) in data_sets {
-            create_and_validate_bitpacked_codec(&data, name);
-            data.reverse();
-            create_and_validate::<BitpackedCodec>(&data, name);
-        }
-    }
-
-    #[test]
-    fn bitpacked_fast_field_rand() {
-        for _ in 0..500 {
-            let mut data = (0..1 + rand::random::<u8>() as usize)
-                .map(|_| rand::random::<i64>() as u64 / 2)
-                .collect::<Vec<_>>();
-            create_and_validate_bitpacked_codec(&data, "rand");
-            data.reverse();
-            create_and_validate::<BitpackedCodec>(&data, "rand");
-        }
-    }
-}
--- a/columnar/src/column_values/blockwise_linear.rs
+++ b/columnar/src/column_values/blockwise_linear.rs
@@ -1,188 +0,0 @@
-use std::sync::Arc;
-use std::{io, iter};
-
-use common::{BinarySerializable, CountingWriter, DeserializeFrom, OwnedBytes};
-use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
-
-use crate::column_values::line::Line;
-use crate::column_values::serialize::NormalizedHeader;
-use crate::column_values::{ColumnValues, FastFieldCodec, FastFieldCodecType, VecColumn};
-
-const CHUNK_SIZE: usize = 512;
-
-#[derive(Debug, Default)]
-struct Block {
-    line: Line,
-    bit_unpacker: BitUnpacker,
-    data_start_offset: usize,
-}
-
-impl BinarySerializable for Block {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        self.line.serialize(writer)?;
-        self.bit_unpacker.bit_width().serialize(writer)?;
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let line = Line::deserialize(reader)?;
-        let bit_width = u8::deserialize(reader)?;
-        Ok(Block {
-            line,
-            bit_unpacker: BitUnpacker::new(bit_width),
-            data_start_offset: 0,
-        })
-    }
-}
-
-fn compute_num_blocks(num_vals: u32) -> usize {
-    (num_vals as usize + CHUNK_SIZE - 1) / CHUNK_SIZE
-}
-
-pub struct BlockwiseLinearCodec;
-
-impl FastFieldCodec for BlockwiseLinearCodec {
-    const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;
-    type Reader = BlockwiseLinearReader;
-
-    fn open_from_bytes(
-        bytes: common::OwnedBytes,
-        normalized_header: NormalizedHeader,
-    ) -> io::Result<Self::Reader> {
-        let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
-        let footer_offset = bytes.len() - 4 - footer_len as usize;
-        let (data, mut footer) = bytes.split(footer_offset);
-        let num_blocks = compute_num_blocks(normalized_header.num_vals);
-        let mut blocks: Vec<Block> = iter::repeat_with(|| Block::deserialize(&mut footer))
-            .take(num_blocks)
-            .collect::<io::Result<_>>()?;
-
-        let mut start_offset = 0;
-        for block in &mut blocks {
-            block.data_start_offset = start_offset;
-            start_offset += (block.bit_unpacker.bit_width() as usize) * CHUNK_SIZE / 8;
-        }
-        Ok(BlockwiseLinearReader {
-            blocks: Arc::new(blocks),
-            data,
-            normalized_header,
-        })
-    }
-
-    // Estimate first_chunk and extrapolate
-    fn estimate(column: &dyn ColumnValues) -> Option<f32> {
-        if column.num_vals() < 10 * CHUNK_SIZE as u32 {
-            return None;
-        }
-        let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE).collect();
-        let line = Line::train(&VecColumn::from(&first_chunk));
-        for (i, buffer_val) in first_chunk.iter_mut().enumerate() {
-            let interpolated_val = line.eval(i as u32);
-            *buffer_val = buffer_val.wrapping_sub(interpolated_val);
-        }
-        let estimated_bit_width = first_chunk
-            .iter()
-            .map(|el| ((el + 1) as f32 * 3.0) as u64)
-            .map(compute_num_bits)
-            .max()
-            .unwrap();
-
-        let metadata_per_block = {
-            let mut out = vec![];
-            Block::default().serialize(&mut out).unwrap();
-            out.len()
-        };
-        let num_bits = estimated_bit_width as u64 * column.num_vals() as u64
-            // function metadata per block
-            + metadata_per_block as u64 * (column.num_vals() as u64 / CHUNK_SIZE as u64);
-        let num_bits_uncompressed = 64 * column.num_vals();
-        Some(num_bits as f32 / num_bits_uncompressed as f32)
-    }
-
-    fn serialize(column: &dyn ColumnValues, wrt: &mut impl io::Write) -> io::Result<()> {
-        // The BitpackedReader assumes a normalized vector.
-        assert_eq!(column.min_value(), 0);
-        let mut buffer = Vec::with_capacity(CHUNK_SIZE);
-        let num_vals = column.num_vals();
-
-        let num_blocks = compute_num_blocks(num_vals);
-        let mut blocks = Vec::with_capacity(num_blocks);
-
-        let mut vals = column.iter();
-
-        let mut bit_packer = BitPacker::new();
-
-        for _ in 0..num_blocks {
-            buffer.clear();
-            buffer.extend((&mut vals).take(CHUNK_SIZE));
-            let line = Line::train(&VecColumn::from(&buffer));
-
-            assert!(!buffer.is_empty());
-
-            for (i, buffer_val) in buffer.iter_mut().enumerate() {
-                let interpolated_val = line.eval(i as u32);
-                *buffer_val = buffer_val.wrapping_sub(interpolated_val);
-            }
-            let bit_width = buffer.iter().copied().map(compute_num_bits).max().unwrap();
-
-            for &buffer_val in &buffer {
-                bit_packer.write(buffer_val, bit_width, wrt)?;
-            }
-
-            blocks.push(Block {
-                line,
-                bit_unpacker: BitUnpacker::new(bit_width),
-                data_start_offset: 0,
-            });
-        }
-
-        bit_packer.close(wrt)?;
-
-        assert_eq!(blocks.len(), compute_num_blocks(num_vals));
-
-        let mut counting_wrt = CountingWriter::wrap(wrt);
-        for block in &blocks {
-            block.serialize(&mut counting_wrt)?;
-        }
-        let footer_len = counting_wrt.written_bytes();
-        (footer_len as u32).serialize(&mut counting_wrt)?;
-
-        Ok(())
-    }
-}
-
-#[derive(Clone)]
-pub struct BlockwiseLinearReader {
-    blocks: Arc<Vec<Block>>,
-    normalized_header: NormalizedHeader,
-    data: OwnedBytes,
-}
-
-impl ColumnValues for BlockwiseLinearReader {
-    #[inline(always)]
-    fn get_val(&self, idx: u32) -> u64 {
-        let block_id = (idx / CHUNK_SIZE as u32) as usize;
-        let idx_within_block = idx % (CHUNK_SIZE as u32);
-        let block = &self.blocks[block_id];
-        let interpoled_val: u64 = block.line.eval(idx_within_block);
-        let block_bytes = &self.data[block.data_start_offset..];
-        let bitpacked_diff = block.bit_unpacker.get(idx_within_block, block_bytes);
-        interpoled_val.wrapping_add(bitpacked_diff)
-    }
-
-    #[inline(always)]
-    fn min_value(&self) -> u64 {
-        // The BlockwiseLinearReader assumes a normalized vector.
-        0u64
-    }
-
-    #[inline(always)]
-    fn max_value(&self) -> u64 {
-        self.normalized_header.max_value
-    }
-
-    #[inline(always)]
-    fn num_vals(&self) -> u32 {
-        self.normalized_header.num_vals
-    }
-}
--- a/columnar/src/column_values/column.rs
+++ b/columnar/src/column_values/column.rs
@@ -1,376 +0,0 @@
-use std::marker::PhantomData;
-use std::ops::{Range, RangeInclusive};
-
-use tantivy_bitpacker::minmax;
-
-use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
-
-/// `ColumnValues` provides access to a dense field column.
-///
-/// `Column` are just a wrapper over `ColumnValues` and a `ColumnIndex`.
-pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
-    /// Return the value associated with the given idx.
-    ///
-    /// This accessor should return as fast as possible.
-    ///
-    /// # Panics
-    ///
-    /// May panic if `idx` is greater than the column length.
-    fn get_val(&self, idx: u32) -> T;
-
-    /// Fills an output buffer with the fast field values
-    /// associated with the `DocId` going from
-    /// `start` to `start + output.len()`.
-    ///
-    /// # Panics
-    ///
-    /// Must panic if `start + output.len()` is greater than
-    /// the segment's `maxdoc`.
-    #[inline]
-    fn get_range(&self, start: u64, output: &mut [T]) {
-        for (out, idx) in output.iter_mut().zip(start..) {
-            *out = self.get_val(idx as u32);
-        }
-    }
-
-    /// Get the positions of values which are in the provided value range.
-    ///
-    /// Note that position == docid for single value fast fields
-    #[inline]
-    fn get_docids_for_value_range(
-        &self,
-        value_range: RangeInclusive<T>,
-        doc_id_range: Range<u32>,
-        positions: &mut Vec<u32>,
-    ) {
-        let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
-
-        for idx in doc_id_range.start..doc_id_range.end {
-            let val = self.get_val(idx);
-            if value_range.contains(&val) {
-                positions.push(idx);
-            }
-        }
-    }
-
-    /// Returns the minimum value for this fast field.
-    ///
-    /// This min_value may not be exact.
-    /// For instance, the min value does not take in account of possible
-    /// deleted document. All values are however guaranteed to be higher than
-    /// `.min_value()`.
-    fn min_value(&self) -> T;
-
-    /// Returns the maximum value for this fast field.
-    ///
-    /// This max_value may not be exact.
-    /// For instance, the max value does not take in account of possible
-    /// deleted document. All values are however guaranteed to be higher than
-    /// `.max_value()`.
-    fn max_value(&self) -> T;
-
-    /// The number of values in the column.
-    fn num_vals(&self) -> u32;
-
-    /// Returns a iterator over the data
-    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
-        Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
-    }
-}
-
-impl<T: Copy + PartialOrd> ColumnValues<T> for std::sync::Arc<dyn ColumnValues<T>> {
-    fn get_val(&self, idx: u32) -> T {
-        self.as_ref().get_val(idx)
-    }
-
-    fn min_value(&self) -> T {
-        self.as_ref().min_value()
-    }
-
-    fn max_value(&self) -> T {
-        self.as_ref().max_value()
-    }
-
-    fn num_vals(&self) -> u32 {
-        self.as_ref().num_vals()
-    }
-
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
-        self.as_ref().iter()
-    }
-
-    fn get_range(&self, start: u64, output: &mut [T]) {
-        self.as_ref().get_range(start, output)
-    }
-}
-
-impl<'a, C: ColumnValues<T> + ?Sized, T: Copy + PartialOrd> ColumnValues<T> for &'a C {
-    fn get_val(&self, idx: u32) -> T {
-        (*self).get_val(idx)
-    }
-
-    fn min_value(&self) -> T {
-        (*self).min_value()
-    }
-
-    fn max_value(&self) -> T {
-        (*self).max_value()
-    }
-
-    fn num_vals(&self) -> u32 {
-        (*self).num_vals()
-    }
-
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
-        (*self).iter()
-    }
-
-    fn get_range(&self, start: u64, output: &mut [T]) {
-        (*self).get_range(start, output)
-    }
-}
-
-/// VecColumn provides `Column` over a slice.
-pub struct VecColumn<'a, T = u64> {
-    pub(crate) values: &'a [T],
-    pub(crate) min_value: T,
-    pub(crate) max_value: T,
-}
-
-impl<'a, T: Copy + PartialOrd + Send + Sync> ColumnValues<T> for VecColumn<'a, T> {
-    fn get_val(&self, position: u32) -> T {
-        self.values[position as usize]
-    }
-
-    fn iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
-        Box::new(self.values.iter().copied())
-    }
-
-    fn min_value(&self) -> T {
-        self.min_value
-    }
-
-    fn max_value(&self) -> T {
-        self.max_value
-    }
-
-    fn num_vals(&self) -> u32 {
-        self.values.len() as u32
-    }
-
-    fn get_range(&self, start: u64, output: &mut [T]) {
-        output.copy_from_slice(&self.values[start as usize..][..output.len()])
-    }
-}
-
-impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
-where V: AsRef<[T]> + ?Sized
-{
-    fn from(values: &'a V) -> Self {
-        let values = values.as_ref();
-        let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
-        Self {
-            values,
-            min_value,
-            max_value,
-        }
-    }
-}
-
-struct MonotonicMappingColumn<C, T, Input> {
-    from_column: C,
-    monotonic_mapping: T,
-    _phantom: PhantomData<Input>,
-}
-
-/// Creates a view of a column transformed by a strictly monotonic mapping. See
-/// [`StrictlyMonotonicFn`].
-///
-/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
-/// monotonic_mapping.mapping() is expected to be injective, and we should always have
-/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
-///
-/// The inverse of the mapping is required for:
-/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> `
-/// The user provides the original value range and we need to monotonic map them in the same way the
-/// serialization does before calling the underlying column.
-///
-/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
-/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
-/// monotonic_mapping during serialization.
-pub fn monotonic_map_column<C, T, Input, Output>(
-    from_column: C,
-    monotonic_mapping: T,
-) -> impl ColumnValues<Output>
-where
-    C: ColumnValues<Input>,
-    T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
-    Input: PartialOrd + Send + Sync + Clone,
-    Output: PartialOrd + Send + Sync + Clone,
-{
-    MonotonicMappingColumn {
-        from_column,
-        monotonic_mapping,
-        _phantom: PhantomData,
-    }
-}
-
-impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input>
-where
-    C: ColumnValues<Input>,
-    T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
-    Input: PartialOrd + Send + Sync + Clone,
-    Output: PartialOrd + Send + Sync + Clone,
-{
-    #[inline]
-    fn get_val(&self, idx: u32) -> Output {
-        let from_val = self.from_column.get_val(idx);
-        self.monotonic_mapping.mapping(from_val)
-    }
-
-    fn min_value(&self) -> Output {
-        let from_min_value = self.from_column.min_value();
-        self.monotonic_mapping.mapping(from_min_value)
-    }
-
-    fn max_value(&self) -> Output {
-        let from_max_value = self.from_column.max_value();
-        self.monotonic_mapping.mapping(from_max_value)
-    }
-
-    fn num_vals(&self) -> u32 {
-        self.from_column.num_vals()
-    }
-
-    fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
-        Box::new(
-            self.from_column
-                .iter()
-                .map(|el| self.monotonic_mapping.mapping(el)),
-        )
-    }
-
-    fn get_docids_for_value_range(
-        &self,
-        range: RangeInclusive<Output>,
-        doc_id_range: Range<u32>,
-        positions: &mut Vec<u32>,
-    ) {
-        self.from_column.get_docids_for_value_range(
-            self.monotonic_mapping.inverse(range.start().clone())
-                ..=self.monotonic_mapping.inverse(range.end().clone()),
-            doc_id_range,
-            positions,
-        )
-    }
-
-    // We voluntarily do not implement get_range as it yields a regression,
-    // and we do not have any specialized implementation anyway.
-}
-
-/// Wraps an iterator into a `Column`.
-pub struct IterColumn<T>(T);
-
-impl<T> From<T> for IterColumn<T>
-where T: Iterator + Clone + ExactSizeIterator
-{
-    fn from(iter: T) -> Self {
-        IterColumn(iter)
-    }
-}
-
-impl<T> ColumnValues<T::Item> for IterColumn<T>
-where
-    T: Iterator + Clone + ExactSizeIterator + Send + Sync,
-    T::Item: PartialOrd,
-{
-    fn get_val(&self, idx: u32) -> T::Item {
-        self.0.clone().nth(idx as usize).unwrap()
-    }
-
-    fn min_value(&self) -> T::Item {
-        self.0.clone().next().unwrap()
-    }
-
-    fn max_value(&self) -> T::Item {
-        self.0.clone().last().unwrap()
-    }
-
-    fn num_vals(&self) -> u32 {
-        self.0.len() as u32
-    }
-
-    fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {
-        Box::new(self.0.clone())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::column_values::monotonic_mapping::{
-        StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval,
-        StrictlyMonotonicMappingToInternalGCDBaseval,
-    };
-
-    #[test]
-    fn test_monotonic_mapping() {
-        let vals = &[3u64, 5u64][..];
-        let col = VecColumn::from(vals);
-        let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2));
-        assert_eq!(mapped.min_value(), 1u64);
-        assert_eq!(mapped.max_value(), 3u64);
-        assert_eq!(mapped.num_vals(), 2);
-        assert_eq!(mapped.num_vals(), 2);
-        assert_eq!(mapped.get_val(0), 1);
-        assert_eq!(mapped.get_val(1), 3);
-    }
-
-    #[test]
-    fn test_range_as_col() {
-        let col = IterColumn::from(10..100);
-        assert_eq!(col.num_vals(), 90);
-        assert_eq!(col.max_value(), 99);
-    }
-
-    #[test]
-    fn test_monotonic_mapping_iter() {
-        let vals: Vec<u64> = (10..110u64).map(|el| el * 10).collect();
-        let col = VecColumn::from(&vals);
-        let mapped = monotonic_map_column(
-            col,
-            StrictlyMonotonicMappingInverter::from(
-                StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100),
-            ),
-        );
-        let val_i64s: Vec<u64> = mapped.iter().collect();
-        for i in 0..100 {
-            assert_eq!(val_i64s[i as usize], mapped.get_val(i));
-        }
-    }
-
-    #[test]
-    fn test_monotonic_mapping_get_range() {
-        let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
-        let col = VecColumn::from(&vals);
-        let mapped = monotonic_map_column(
-            col,
-            StrictlyMonotonicMappingInverter::from(
-                StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0),
-            ),
-        );
-
-        assert_eq!(mapped.min_value(), 0u64);
-        assert_eq!(mapped.max_value(), 9900u64);
-        assert_eq!(mapped.num_vals(), 100);
-        let val_u64s: Vec<u64> = mapped.iter().collect();
-        assert_eq!(val_u64s.len(), 100);
-        for i in 0..100 {
-            assert_eq!(val_u64s[i as usize], mapped.get_val(i));
-            assert_eq!(val_u64s[i as usize], vals[i as usize] * 10);
-        }
-        let mut buf = [0u64; 20];
-        mapped.get_range(7, &mut buf[..]);
-        assert_eq!(&val_u64s[7..][..20], &buf);
-    }
-}
--- a/columnar/src/column_values/column_with_cardinality.rs
+++ b/columnar/src/column_values/column_with_cardinality.rs
@@ -1,19 +0,0 @@
-// Copyright (C) 2022 Quickwit, Inc.
-//
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
-//
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
-//
--- a/columnar/src/column_values/compact_space/build_compact_space.rs
+++ b/columnar/src/column_values/compact_space/build_compact_space.rs
@@ -1,231 +0,0 @@
-use std::collections::{BTreeSet, BinaryHeap};
-use std::iter;
-use std::ops::RangeInclusive;
-
-use itertools::Itertools;
-
-use super::blank_range::BlankRange;
-use super::{CompactSpace, RangeMapping};
-
-/// Put the blanks for the sorted values into a binary heap
-fn get_blanks(values_sorted: &BTreeSet<u128>) -> BinaryHeap<BlankRange> {
-    let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
-    for (first, second) in values_sorted.iter().tuple_windows() {
-        // Correctness Overflow: the values are deduped and sorted (BTreeSet property), that means
-        // there's always space between two values.
-        let blank_range = first + 1..=second - 1;
-        let blank_range: Result<BlankRange, _> = blank_range.try_into();
-        if let Ok(blank_range) = blank_range {
-            blanks.push(blank_range);
-        }
-    }
-
-    blanks
-}
-
-struct BlankCollector {
-    blanks: Vec<BlankRange>,
-    staged_blanks_sum: u128,
-}
-impl BlankCollector {
-    fn new() -> Self {
-        Self {
-            blanks: vec![],
-            staged_blanks_sum: 0,
-        }
-    }
-    fn stage_blank(&mut self, blank: BlankRange) {
-        self.staged_blanks_sum += blank.blank_size();
-        self.blanks.push(blank);
-    }
-    fn drain(&mut self) -> impl Iterator<Item = BlankRange> + '_ {
-        self.staged_blanks_sum = 0;
-        self.blanks.drain(..)
-    }
-    fn staged_blanks_sum(&self) -> u128 {
-        self.staged_blanks_sum
-    }
-    fn num_staged_blanks(&self) -> usize {
-        self.blanks.len()
-    }
-}
-fn num_bits(val: u128) -> u8 {
-    (128u32 - val.leading_zeros()) as u8
-}
-
-/// Will collect blanks and add them to compact space if more bits are saved than cost from
-/// metadata.
-pub fn get_compact_space(
-    values_deduped_sorted: &BTreeSet<u128>,
-    total_num_values: u32,
-    cost_per_blank: usize,
-) -> CompactSpace {
-    let mut compact_space_builder = CompactSpaceBuilder::new();
-    if values_deduped_sorted.is_empty() {
-        return compact_space_builder.finish();
-    }
-
-    let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);
-    // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
-
-    // We start by space that's limited to min_value..=max_value
-    let min_value = *values_deduped_sorted.iter().next().unwrap_or(&0);
-    let max_value = *values_deduped_sorted.iter().last().unwrap_or(&0);
-
-    // +1 for null, in case min and max covers the whole space, we are off by one.
-    let mut amplitude_compact_space = (max_value - min_value).saturating_add(1);
-    if min_value != 0 {
-        compact_space_builder.add_blanks(iter::once(0..=min_value - 1));
-    }
-    if max_value != u128::MAX {
-        compact_space_builder.add_blanks(iter::once(max_value + 1..=u128::MAX));
-    }
-
-    let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);
-
-    let mut blank_collector = BlankCollector::new();
-    // We will stage blanks until they reduce the compact space by at least 1 bit and then flush
-    // them if the metadata cost is lower than the total number of saved bits.
-    // Binary heap to process the gaps by their size
-    while let Some(blank_range) = blanks.pop() {
-        blank_collector.stage_blank(blank_range);
-
-        let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
-        let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum;
-        let amplitude_new_bits = num_bits(amplitude_new_compact_space);
-        if amplitude_bits == amplitude_new_bits {
-            continue;
-        }
-        let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * total_num_values as usize;
-        // TODO: Maybe calculate exact cost of blanks and run this more expensive computation only,
-        // when amplitude_new_bits changes
-        let cost = blank_collector.num_staged_blanks() * cost_per_blank;
-        if cost >= saved_bits {
-            // Continue here, since although we walk over the blanks by size,
-            // we can potentially save a lot at the last bits, which are smaller blanks
-            //
-            // E.g. if the first range reduces the compact space by 1000 from 2000 to 1000, which
-            // saves 11-10=1 bit and the next range reduces the compact space by 950 to
-            // 50, which saves 10-6=4 bit
-            continue;
-        }
-
-        amplitude_compact_space = amplitude_new_compact_space;
-        amplitude_bits = amplitude_new_bits;
-        compact_space_builder.add_blanks(blank_collector.drain().map(|blank| blank.blank_range()));
-    }
-
-    // special case, when we don't collected any blanks because:
-    // * the data is empty (early exit)
-    // * the algorithm did decide it's not worth the cost, which can be the case for single values
-    //
-    // We drain one collected blank unconditionally, so the empty case is reserved for empty
-    // data, and therefore empty compact_space means the data is empty and no data is covered
-    // (conversely to all data) and we can assign null to it.
-    if compact_space_builder.is_empty() {
-        compact_space_builder.add_blanks(
-            blank_collector
-                .drain()
-                .map(|blank| blank.blank_range())
-                .take(1),
-        );
-    }
-
-    let compact_space = compact_space_builder.finish();
-    if max_value - min_value != u128::MAX {
-        debug_assert_eq!(
-            compact_space.amplitude_compact_space(),
-            amplitude_compact_space
-        );
-    }
-    compact_space
-}
-
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct CompactSpaceBuilder {
-    blanks: Vec<RangeInclusive<u128>>,
-}
-
-impl CompactSpaceBuilder {
-    /// Creates a new compact space builder which will initially cover the whole space.
-    fn new() -> Self {
-        Self { blanks: Vec::new() }
-    }
-
-    /// Assumes that repeated add_blank calls don't overlap and are not adjacent,
-    /// e.g. [3..=5, 5..=10] is not allowed
-    ///
-    /// Both of those assumptions are true when blanks are produced from sorted values.
-    fn add_blanks(&mut self, blank: impl Iterator<Item = RangeInclusive<u128>>) {
-        self.blanks.extend(blank);
-    }
-
-    fn is_empty(&self) -> bool {
-        self.blanks.is_empty()
-    }
-
-    /// Convert blanks to covered space and assign null value
-    fn finish(mut self) -> CompactSpace {
-        // sort by start. ranges are not allowed to overlap
-        self.blanks.sort_unstable_by_key(|blank| *blank.start());
-
-        let mut covered_space = Vec::with_capacity(self.blanks.len());
-
-        // begining of the blanks
-        if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
-            if *first_blank_start != 0 {
-                covered_space.push(0..=first_blank_start - 1);
-            }
-        }
-
-        // Between the blanks
-        let between_blanks = self.blanks.iter().tuple_windows().map(|(left, right)| {
-            assert!(
-                left.end() < right.start(),
-                "overlapping or adjacent ranges detected"
-            );
-            *left.end() + 1..=*right.start() - 1
-        });
-        covered_space.extend(between_blanks);
-
-        // end of the blanks
-        if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
-            if *last_blank_end != u128::MAX {
-                covered_space.push(last_blank_end + 1..=u128::MAX);
-            }
-        }
-
-        if covered_space.is_empty() {
-            covered_space.push(0..=0); // empty data case
-        };
-
-        let mut compact_start: u64 = 1; // 0 is reserved for `null`
-        let mut ranges_mapping: Vec<RangeMapping> = Vec::with_capacity(covered_space.len());
-        for cov in covered_space {
-            let range_mapping = super::RangeMapping {
-                value_range: cov,
-                compact_start,
-            };
-            let covered_range_len = range_mapping.range_length();
-            ranges_mapping.push(range_mapping);
-            compact_start += covered_range_len;
-        }
-        // println!("num ranges {}", ranges_mapping.len());
-        CompactSpace { ranges_mapping }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_binary_heap_pop_order() {
-        let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
-        blanks.push((0..=10).try_into().unwrap());
-        blanks.push((100..=200).try_into().unwrap());
-        blanks.push((100..=110).try_into().unwrap());
-        assert_eq!(blanks.pop().unwrap().blank_size(), 101);
-        assert_eq!(blanks.pop().unwrap().blank_size(), 11);
-    }
-}
--- a/columnar/src/column_values/compact_space/mod.rs
+++ b/columnar/src/column_values/compact_space/mod.rs
@@ -1,813 +0,0 @@
-/// This codec takes a large number space (u128) and reduces it to a compact number space.
-///
-/// It will find spaces in the number range. For example:
-///
-/// 100, 101, 102, 103, 104, 50000, 50001
-/// could be mapped to
-/// 100..104 -> 0..4
-/// 50000..50001 -> 5..6
-///
-/// Compact space 0..=6 requires much less bits than 100..=50001
-///
-/// The codec is created to compress ip addresses, but may be employed in other use cases.
-use std::{
-    cmp::Ordering,
-    collections::BTreeSet,
-    io::{self, Write},
-    ops::{Range, RangeInclusive},
-};
-
-use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
-use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
-
-use crate::column_values::compact_space::build_compact_space::get_compact_space;
-use crate::column_values::ColumnValues;
-
-mod blank_range;
-mod build_compact_space;
-
-/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
-/// blanks depends on the number of blanks.
-///
-/// The number is taken by looking at a real dataset. It is optimized for larger datasets.
-const COST_PER_BLANK_IN_BITS: usize = 36;
-
-#[derive(Debug, Clone, Eq, PartialEq)]
-pub struct CompactSpace {
-    ranges_mapping: Vec<RangeMapping>,
-}
-
-/// Maps the range from the original space to compact_start + range.len()
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct RangeMapping {
-    value_range: RangeInclusive<u128>,
-    compact_start: u64,
-}
-impl RangeMapping {
-    fn range_length(&self) -> u64 {
-        (self.value_range.end() - self.value_range.start()) as u64 + 1
-    }
-
-    // The last value of the compact space in this range
-    fn compact_end(&self) -> u64 {
-        self.compact_start + self.range_length() - 1
-    }
-}
-
-impl BinarySerializable for CompactSpace {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        VInt(self.ranges_mapping.len() as u64).serialize(writer)?;
-
-        let mut prev_value = 0;
-        for value_range in self
-            .ranges_mapping
-            .iter()
-            .map(|range_mapping| &range_mapping.value_range)
-        {
-            let blank_delta_start = value_range.start() - prev_value;
-            VIntU128(blank_delta_start).serialize(writer)?;
-            prev_value = *value_range.start();
-
-            let blank_delta_end = value_range.end() - prev_value;
-            VIntU128(blank_delta_end).serialize(writer)?;
-            prev_value = *value_range.end();
-        }
-
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let num_ranges = VInt::deserialize(reader)?.0;
-        let mut ranges_mapping: Vec<RangeMapping> = vec![];
-        let mut value = 0u128;
-        let mut compact_start = 1u64; // 0 is reserved for `null`
-        for _ in 0..num_ranges {
-            let blank_delta_start = VIntU128::deserialize(reader)?.0;
-            value += blank_delta_start;
-            let blank_start = value;
-
-            let blank_delta_end = VIntU128::deserialize(reader)?.0;
-            value += blank_delta_end;
-            let blank_end = value;
-
-            let range_mapping = RangeMapping {
-                value_range: blank_start..=blank_end,
-                compact_start,
-            };
-            let range_length = range_mapping.range_length();
-            ranges_mapping.push(range_mapping);
-            compact_start += range_length;
-        }
-
-        Ok(Self { ranges_mapping })
-    }
-}
-
-impl CompactSpace {
-    /// Amplitude is the value range of the compact space including the sentinel value used to
-    /// identify null values. The compact space is 0..=amplitude .
-    ///
-    /// It's only used to verify we don't exceed u64 number space, which would indicate a bug.
-    fn amplitude_compact_space(&self) -> u128 {
-        self.ranges_mapping
-            .last()
-            .map(|last_range| last_range.compact_end() as u128)
-            .unwrap_or(1) // compact space starts at 1, 0 == null
-    }
-
-    fn get_range_mapping(&self, pos: usize) -> &RangeMapping {
-        &self.ranges_mapping[pos]
-    }
-
-    /// Returns either Ok(the value in the compact space) or if it is outside the compact space the
-    /// Err(position where it would be inserted)
-    fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
-        self.ranges_mapping
-            .binary_search_by(|probe| {
-                let value_range = &probe.value_range;
-                if value < *value_range.start() {
-                    Ordering::Greater
-                } else if value > *value_range.end() {
-                    Ordering::Less
-                } else {
-                    Ordering::Equal
-                }
-            })
-            .map(|pos| {
-                let range_mapping = &self.ranges_mapping[pos];
-                let pos_in_range = (value - range_mapping.value_range.start()) as u64;
-                range_mapping.compact_start + pos_in_range
-            })
-    }
-
-    /// Unpacks a value from compact space u64 to u128 space
-    fn compact_to_u128(&self, compact: u64) -> u128 {
-        let pos = self
-            .ranges_mapping
-            .binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start)
-            // Correctness: Overflow. The first range starts at compact space 0, the error from
-            // binary search can never be 0
-            .map_or_else(|e| e - 1, |v| v);
-
-        let range_mapping = &self.ranges_mapping[pos];
-        let diff = compact - range_mapping.compact_start;
-        range_mapping.value_range.start() + diff as u128
-    }
-}
-
-pub struct CompactSpaceCompressor {
-    params: IPCodecParams,
-}
-#[derive(Debug, Clone)]
-pub struct IPCodecParams {
-    compact_space: CompactSpace,
-    bit_unpacker: BitUnpacker,
-    min_value: u128,
-    max_value: u128,
-    num_vals: u32,
-    num_bits: u8,
-}
-
-impl CompactSpaceCompressor {
-    /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
-    pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u32) -> Self {
-        let mut values_sorted = BTreeSet::new();
-        values_sorted.extend(iter);
-        let total_num_values = num_vals;
-
-        let compact_space =
-            get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
-        let amplitude_compact_space = compact_space.amplitude_compact_space();
-
-        assert!(
-            amplitude_compact_space <= u64::MAX as u128,
-            "case unsupported."
-        );
-
-        let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64);
-        let min_value = *values_sorted.iter().next().unwrap_or(&0);
-        let max_value = *values_sorted.iter().last().unwrap_or(&0);
-        assert_eq!(
-            compact_space
-                .u128_to_compact(max_value)
-                .expect("could not convert max value to compact space"),
-            amplitude_compact_space as u64
-        );
-        CompactSpaceCompressor {
-            params: IPCodecParams {
-                compact_space,
-                bit_unpacker: BitUnpacker::new(num_bits),
-                min_value,
-                max_value,
-                num_vals: total_num_values,
-                num_bits,
-            },
-        }
-    }
-
-    fn write_footer(self, writer: &mut impl Write) -> io::Result<()> {
-        let writer = &mut CountingWriter::wrap(writer);
-        self.params.serialize(writer)?;
-
-        let footer_len = writer.written_bytes() as u32;
-        footer_len.serialize(writer)?;
-
-        Ok(())
-    }
-
-    pub fn compress_into(
-        self,
-        vals: impl Iterator<Item = u128>,
-        write: &mut impl Write,
-    ) -> io::Result<()> {
-        let mut bitpacker = BitPacker::default();
-        for val in vals {
-            let compact = self
-                .params
-                .compact_space
-                .u128_to_compact(val)
-                .map_err(|_| {
-                    io::Error::new(
-                        io::ErrorKind::InvalidData,
-                        "Could not convert value to compact_space. This is a bug.",
-                    )
-                })?;
-            bitpacker.write(compact, self.params.num_bits, write)?;
-        }
-        bitpacker.close(write)?;
-        self.write_footer(write)?;
-        Ok(())
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct CompactSpaceDecompressor {
-    data: OwnedBytes,
-    params: IPCodecParams,
-}
-
-impl BinarySerializable for IPCodecParams {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        // header flags for future optional dictionary encoding
-        let footer_flags = 0u64;
-        footer_flags.serialize(writer)?;
-
-        VIntU128(self.min_value).serialize(writer)?;
-        VIntU128(self.max_value).serialize(writer)?;
-        VIntU128(self.num_vals as u128).serialize(writer)?;
-        self.num_bits.serialize(writer)?;
-
-        self.compact_space.serialize(writer)?;
-
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let _header_flags = u64::deserialize(reader)?;
-        let min_value = VIntU128::deserialize(reader)?.0;
-        let max_value = VIntU128::deserialize(reader)?.0;
-        let num_vals = VIntU128::deserialize(reader)?.0 as u32;
-        let num_bits = u8::deserialize(reader)?;
-        let compact_space = CompactSpace::deserialize(reader)?;
-
-        Ok(Self {
-            compact_space,
-            bit_unpacker: BitUnpacker::new(num_bits),
-            min_value,
-            max_value,
-            num_vals,
-            num_bits,
-        })
-    }
-}
-
-impl ColumnValues<u128> for CompactSpaceDecompressor {
-    #[inline]
-    fn get_val(&self, doc: u32) -> u128 {
-        self.get(doc)
-    }
-
-    fn min_value(&self) -> u128 {
-        self.min_value()
-    }
-
-    fn max_value(&self) -> u128 {
-        self.max_value()
-    }
-
-    fn num_vals(&self) -> u32 {
-        self.params.num_vals
-    }
-
-    #[inline]
-    fn iter(&self) -> Box<dyn Iterator<Item = u128> + '_> {
-        Box::new(self.iter())
-    }
-
-    #[inline]
-    fn get_docids_for_value_range(
-        &self,
-        value_range: RangeInclusive<u128>,
-        positions_range: Range<u32>,
-        positions: &mut Vec<u32>,
-    ) {
-        self.get_positions_for_value_range(value_range, positions_range, positions)
-    }
-}
-
-impl CompactSpaceDecompressor {
-    pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
-        let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
-        let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;
-
-        let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
-        let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
-        let decompressor = CompactSpaceDecompressor { data, params };
-
-        Ok(decompressor)
-    }
-
-    /// Converting to compact space for the decompressor is more complex, since we may get values
-    /// which are outside the compact space. e.g. if we map
-    /// 1000 => 5
-    /// 2000 => 6
-    ///
-    /// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
-    /// error with the index of the next range.
-    fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
-        self.params.compact_space.u128_to_compact(value)
-    }
-
-    fn compact_to_u128(&self, compact: u64) -> u128 {
-        self.params.compact_space.compact_to_u128(compact)
-    }
-
-    /// Comparing on compact space: Random dataset 0,24 (50% random hit) - 1.05 GElements/s
-    /// Comparing on compact space: Real dataset 1.08 GElements/s
-    ///
-    /// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
-    #[inline]
-    pub fn get_positions_for_value_range(
-        &self,
-        value_range: RangeInclusive<u128>,
-        position_range: Range<u32>,
-        positions: &mut Vec<u32>,
-    ) {
-        if value_range.start() > value_range.end() {
-            return;
-        }
-        let position_range = position_range.start..position_range.end.min(self.num_vals());
-        let from_value = *value_range.start();
-        let to_value = *value_range.end();
-        assert!(to_value >= from_value);
-        let compact_from = self.u128_to_compact(from_value);
-        let compact_to = self.u128_to_compact(to_value);
-
-        // Quick return, if both ranges fall into the same non-mapped space, the range can't cover
-        // any values, so we can early exit
-        match (compact_to, compact_from) {
-            (Err(pos1), Err(pos2)) if pos1 == pos2 => return,
-            _ => {}
-        }
-
-        let compact_from = compact_from.unwrap_or_else(|pos| {
-            // Correctness: Out of bounds, if this value is Err(last_index + 1), we early exit,
-            // since the to_value also mapps into the same non-mapped space
-            let range_mapping = self.params.compact_space.get_range_mapping(pos);
-            range_mapping.compact_start
-        });
-        // If there is no compact space, we go to the closest upperbound compact space
-        let compact_to = compact_to.unwrap_or_else(|pos| {
-            // Correctness: Overflow, if this value is Err(0), we early exit,
-            // since the from_value also mapps into the same non-mapped space
-
-            // Get end of previous range
-            let pos = pos - 1;
-            let range_mapping = self.params.compact_space.get_range_mapping(pos);
-            range_mapping.compact_end()
-        });
-
-        let range = compact_from..=compact_to;
-
-        let scan_num_docs = position_range.end - position_range.start;
-
-        let step_size = 4;
-        let cutoff = position_range.start + scan_num_docs - scan_num_docs % step_size;
-
-        let mut push_if_in_range = |idx, val| {
-            if range.contains(&val) {
-                positions.push(idx);
-            }
-        };
-        let get_val = |idx| self.params.bit_unpacker.get(idx, &self.data);
-        // unrolled loop
-        for idx in (position_range.start..cutoff).step_by(step_size as usize) {
-            let idx1 = idx;
-            let idx2 = idx + 1;
-            let idx3 = idx + 2;
-            let idx4 = idx + 3;
-            let val1 = get_val(idx1);
-            let val2 = get_val(idx2);
-            let val3 = get_val(idx3);
-            let val4 = get_val(idx4);
-            push_if_in_range(idx1, val1);
-            push_if_in_range(idx2, val2);
-            push_if_in_range(idx3, val3);
-            push_if_in_range(idx4, val4);
-        }
-
-        // handle rest
-        for idx in cutoff..position_range.end {
-            push_if_in_range(idx, get_val(idx));
-        }
-    }
-
-    #[inline]
-    fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
-        (0..self.params.num_vals).map(move |idx| self.params.bit_unpacker.get(idx, &self.data))
-    }
-
-    #[inline]
-    fn iter(&self) -> impl Iterator<Item = u128> + '_ {
-        // TODO: Performance. It would be better to iterate on the ranges and check existence via
-        // the bit_unpacker.
-        self.iter_compact()
-            .map(|compact| self.compact_to_u128(compact))
-    }
-
-    #[inline]
-    pub fn get(&self, idx: u32) -> u128 {
-        let compact = self.params.bit_unpacker.get(idx, &self.data);
-        self.compact_to_u128(compact)
-    }
-
-    pub fn min_value(&self) -> u128 {
-        self.params.min_value
-    }
-
-    pub fn max_value(&self) -> u128 {
-        self.params.max_value
-    }
-}
-
-// TODO reenable what can be reenabled.
-// #[cfg(test)]
-// mod tests {
-//
-// use super::*;
-// use crate::column::format_version::read_format_version;
-// use crate::column::column_footer::read_null_index_footer;
-// use crate::column::serialize::U128Header;
-// use crate::column::{open_u128, serialize_u128};
-//
-// #[test]
-// fn compact_space_test() {
-// let ips = &[
-// 2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
-// ]
-// .into_iter()
-// .collect();
-// let compact_space = get_compact_space(ips, ips.len() as u32, 11);
-// let amplitude = compact_space.amplitude_compact_space();
-// assert_eq!(amplitude, 17);
-// assert_eq!(1, compact_space.u128_to_compact(2).unwrap());
-// assert_eq!(2, compact_space.u128_to_compact(3).unwrap());
-// assert_eq!(compact_space.u128_to_compact(100).unwrap_err(), 1);
-//
-// for (num1, num2) in (0..3).tuple_windows() {
-// assert_eq!(
-// compact_space.get_range_mapping(num1).compact_end() + 1,
-// compact_space.get_range_mapping(num2).compact_start
-// );
-// }
-//
-// let mut output: Vec<u8> = Vec::new();
-// compact_space.serialize(&mut output).unwrap();
-//
-// assert_eq!(
-// compact_space,
-// CompactSpace::deserialize(&mut &output[..]).unwrap()
-// );
-//
-// for ip in ips {
-// let compact = compact_space.u128_to_compact(*ip).unwrap();
-// assert_eq!(compact_space.compact_to_u128(compact), *ip);
-// }
-// }
-//
-// #[test]
-// fn compact_space_amplitude_test() {
-// let ips = &[100000u128, 1000000].into_iter().collect();
-// let compact_space = get_compact_space(ips, ips.len() as u32, 1);
-// let amplitude = compact_space.amplitude_compact_space();
-// assert_eq!(amplitude, 2);
-// }
-//
-// fn test_all(mut data: OwnedBytes, expected: &[u128]) {
-// let _header = U128Header::deserialize(&mut data);
-// let decompressor = CompactSpaceDecompressor::open(data).unwrap();
-// for (idx, expected_val) in expected.iter().cloned().enumerate() {
-// let val = decompressor.get(idx as u32);
-// assert_eq!(val, expected_val);
-//
-// let test_range = |range: RangeInclusive<u128>| {
-// let expected_positions = expected
-// .iter()
-// .positions(|val| range.contains(val))
-// .map(|pos| pos as u32)
-// .collect::<Vec<_>>();
-// let mut positions = Vec::new();
-// decompressor.get_positions_for_value_range(
-// range,
-// 0..decompressor.num_vals(),
-// &mut positions,
-// );
-// assert_eq!(positions, expected_positions);
-// };
-//
-// test_range(expected_val.saturating_sub(1)..=expected_val);
-// test_range(expected_val..=expected_val);
-// test_range(expected_val..=expected_val.saturating_add(1));
-// test_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1));
-// }
-// }
-//
-// fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
-// let mut out = Vec::new();
-// serialize_u128(
-// || u128_vals.iter().cloned(),
-// u128_vals.len() as u32,
-// &mut out,
-// )
-// .unwrap();
-//
-// let data = OwnedBytes::new(out);
-// let (data, _format_version) = read_format_version(data).unwrap();
-// let (data, _null_index_footer) = read_null_index_footer(data).unwrap();
-// test_all(data.clone(), u128_vals);
-//
-// data
-// }
-//
-// #[test]
-// fn test_range_1() {
-// let vals = &[
-// 1u128,
-// 100u128,
-// 3u128,
-// 99999u128,
-// 100000u128,
-// 100001u128,
-// 4_000_211_221u128,
-// 4_000_211_222u128,
-// 333u128,
-// ];
-// let mut data = test_aux_vals(vals);
-//
-// let _header = U128Header::deserialize(&mut data);
-// let decomp = CompactSpaceDecompressor::open(data).unwrap();
-// let complete_range = 0..vals.len() as u32;
-// for (pos, val) in vals.iter().enumerate() {
-// let val = *val;
-// let pos = pos as u32;
-// let mut positions = Vec::new();
-// decomp.get_positions_for_value_range(val..=val, pos..pos + 1, &mut positions);
-// assert_eq!(positions, vec![pos]);
-// }
-//
-// handle docid range out of bounds
-// let positions: Vec<u32> = get_positions_for_value_range_helper(&decomp, 0..=1, 1..u32::MAX);
-// assert!(positions.is_empty());
-//
-// let positions =
-// get_positions_for_value_range_helper(&decomp, 0..=1, complete_range.clone());
-// assert_eq!(positions, vec![0]);
-// let positions =
-// get_positions_for_value_range_helper(&decomp, 0..=2, complete_range.clone());
-// assert_eq!(positions, vec![0]);
-// let positions =
-// get_positions_for_value_range_helper(&decomp, 0..=3, complete_range.clone());
-// assert_eq!(positions, vec![0, 2]);
-// assert_eq!(
-// get_positions_for_value_range_helper(
-// &decomp,
-// 99999u128..=99999u128,
-// complete_range.clone()
-// ),
-// vec![3]
-// );
-// assert_eq!(
-// get_positions_for_value_range_helper(
-// &decomp,
-// 99999u128..=100000u128,
-// complete_range.clone()
-// ),
-// vec![3, 4]
-// );
-// assert_eq!(
-// get_positions_for_value_range_helper(
-// &decomp,
-// 99998u128..=100000u128,
-// complete_range.clone()
-// ),
-// vec![3, 4]
-// );
-// assert_eq!(
-// &get_positions_for_value_range_helper(
-// &decomp,
-// 99998u128..=99999u128,
-// complete_range.clone()
-// ),
-// &[3]
-// );
-// assert!(get_positions_for_value_range_helper(
-// &decomp,
-// 99998u128..=99998u128,
-// complete_range.clone()
-// )
-// .is_empty());
-// assert_eq!(
-// &get_positions_for_value_range_helper(
-// &decomp,
-// 333u128..=333u128,
-// complete_range.clone()
-// ),
-// &[8]
-// );
-// assert_eq!(
-// &get_positions_for_value_range_helper(
-// &decomp,
-// 332u128..=333u128,
-// complete_range.clone()
-// ),
-// &[8]
-// );
-// assert_eq!(
-// &get_positions_for_value_range_helper(
-// &decomp,
-// 332u128..=334u128,
-// complete_range.clone()
-// ),
-// &[8]
-// );
-// assert_eq!(
-// &get_positions_for_value_range_helper(
-// &decomp,
-// 333u128..=334u128,
-// complete_range.clone()
-// ),
-// &[8]
-// );
-//
-// assert_eq!(
-// &get_positions_for_value_range_helper(
-// &decomp,
-// 4_000_211_221u128..=5_000_000_000u128,
-// complete_range
-// ),
-// &[6, 7]
-// );
-// }
-//
-// #[test]
-// fn test_empty() {
-// let vals = &[];
-// let data = test_aux_vals(vals);
-// let _decomp = CompactSpaceDecompressor::open(data).unwrap();
-// }
-//
-// #[test]
-// fn test_range_2() {
-// let vals = &[
-// 100u128,
-// 99999u128,
-// 100000u128,
-// 100001u128,
-// 4_000_211_221u128,
-// 4_000_211_222u128,
-// 333u128,
-// ];
-// let mut data = test_aux_vals(vals);
-// let _header = U128Header::deserialize(&mut data);
-// let decomp = CompactSpaceDecompressor::open(data).unwrap();
-// let complete_range = 0..vals.len() as u32;
-// assert!(
-// &get_positions_for_value_range_helper(&decomp, 0..=5, complete_range.clone())
-// .is_empty(),
-// );
-// assert_eq!(
-// &get_positions_for_value_range_helper(&decomp, 0..=100, complete_range.clone()),
-// &[0]
-// );
-// assert_eq!(
-// &get_positions_for_value_range_helper(&decomp, 0..=105, complete_range),
-// &[0]
-// );
-// }
-//
-// fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
-// column: &C,
-// value_range: RangeInclusive<T>,
-// doc_id_range: Range<u32>,
-// ) -> Vec<u32> {
-// let mut positions = Vec::new();
-// column.get_docids_for_value_range(value_range, doc_id_range, &mut positions);
-// positions
-// }
-//
-// #[test]
-// fn test_range_3() {
-// let vals = &[
-// 200u128,
-// 201,
-// 202,
-// 203,
-// 204,
-// 204,
-// 206,
-// 207,
-// 208,
-// 209,
-// 210,
-// 1_000_000,
-// 5_000_000_000,
-// ];
-// let mut out = Vec::new();
-// serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap();
-// let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
-// let complete_range = 0..vals.len() as u32;
-//
-// assert_eq!(
-// get_positions_for_value_range_helper(&*decomp, 199..=200, complete_range.clone()),
-// vec![0]
-// );
-//
-// assert_eq!(
-// get_positions_for_value_range_helper(&*decomp, 199..=201, complete_range.clone()),
-// vec![0, 1]
-// );
-//
-// assert_eq!(
-// get_positions_for_value_range_helper(&*decomp, 200..=200, complete_range.clone()),
-// vec![0]
-// );
-//
-// assert_eq!(
-// get_positions_for_value_range_helper(&*decomp, 1_000_000..=1_000_000, complete_range),
-// vec![11]
-// );
-// }
-//
-// #[test]
-// fn test_bug1() {
-// let vals = &[9223372036854775806];
-// let _data = test_aux_vals(vals);
-// }
-//
-// #[test]
-// fn test_bug2() {
-// let vals = &[340282366920938463463374607431768211455u128];
-// let _data = test_aux_vals(vals);
-// }
-//
-// #[test]
-// fn test_bug3() {
-// let vals = &[340282366920938463463374607431768211454];
-// let _data = test_aux_vals(vals);
-// }
-//
-// #[test]
-// fn test_bug4() {
-// let vals = &[340282366920938463463374607431768211455, 0];
-// let _data = test_aux_vals(vals);
-// }
-//
-// #[test]
-// fn test_first_large_gaps() {
-// let vals = &[1_000_000_000u128; 100];
-// let _data = test_aux_vals(vals);
-// }
-// use itertools::Itertools;
-// use proptest::prelude::*;
-//
-// fn num_strategy() -> impl Strategy<Value = u128> {
-// prop_oneof![
-// 1 => prop::num::u128::ANY.prop_map(|num| u128::MAX - (num % 10) ),
-// 1 => prop::num::u128::ANY.prop_map(|num| i64::MAX as u128 + 5 - (num % 10) ),
-// 1 => prop::num::u128::ANY.prop_map(|num| i128::MAX as u128 + 5 - (num % 10) ),
-// 1 => prop::num::u128::ANY.prop_map(|num| num % 10 ),
-// 20 => prop::num::u128::ANY,
-// ]
-// }
-//
-// proptest! {
-// #![proptest_config(ProptestConfig::with_cases(10))]
-//
-// #[test]
-// fn compress_decompress_random(vals in proptest::collection::vec(num_strategy()
-// , 1..1000)) {
-// let _data = test_aux_vals(&vals);
-// }
-// }
-// }
-//
--- a/columnar/src/column_values/gcd.rs
+++ b/columnar/src/column_values/gcd.rs
@@ -1,75 +0,0 @@
-use std::num::NonZeroU64;
-
-use fastdivide::DividerU64;
-
-/// Compute the gcd of two non null numbers.
-///
-/// It is recommended, but not required, to feed values such that `large >= small`.
-fn compute_gcd(mut large: NonZeroU64, mut small: NonZeroU64) -> NonZeroU64 {
-    loop {
-        let rem: u64 = large.get() % small;
-        if let Some(new_small) = NonZeroU64::new(rem) {
-            (large, small) = (small, new_small);
-        } else {
-            return small;
-        }
-    }
-}
-
-// Find GCD for iterator of numbers
-pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<NonZeroU64> {
-    let mut numbers = numbers.flat_map(NonZeroU64::new);
-    let mut gcd: NonZeroU64 = numbers.next()?;
-    if gcd.get() == 1 {
-        return Some(gcd);
-    }
-
-    let mut gcd_divider = DividerU64::divide_by(gcd.get());
-    for val in numbers {
-        let remainder = val.get() - (gcd_divider.divide(val.get())) * gcd.get();
-        if remainder == 0 {
-            continue;
-        }
-        gcd = compute_gcd(val, gcd);
-        if gcd.get() == 1 {
-            return Some(gcd);
-        }
-
-        gcd_divider = DividerU64::divide_by(gcd.get());
-    }
-    Some(gcd)
-}
-
-#[cfg(test)]
-mod tests {
-    use std::num::NonZeroU64;
-
-    use crate::column_values::gcd::{compute_gcd, find_gcd};
-
-    #[test]
-    fn test_compute_gcd() {
-        let test_compute_gcd_aux = |large, small, expected| {
-            let large = NonZeroU64::new(large).unwrap();
-            let small = NonZeroU64::new(small).unwrap();
-            let expected = NonZeroU64::new(expected).unwrap();
-            assert_eq!(compute_gcd(small, large), expected);
-            assert_eq!(compute_gcd(large, small), expected);
-        };
-        test_compute_gcd_aux(1, 4, 1);
-        test_compute_gcd_aux(2, 4, 2);
-        test_compute_gcd_aux(10, 25, 5);
-        test_compute_gcd_aux(25, 25, 25);
-    }
-
-    #[test]
-    fn find_gcd_test() {
-        assert_eq!(find_gcd([0].into_iter()), None);
-        assert_eq!(find_gcd([0, 10].into_iter()), NonZeroU64::new(10));
-        assert_eq!(find_gcd([10, 0].into_iter()), NonZeroU64::new(10));
-        assert_eq!(find_gcd([].into_iter()), None);
-        assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), NonZeroU64::new(5));
-        assert_eq!(find_gcd([15, 16, 10].into_iter()), NonZeroU64::new(1));
-        assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), NonZeroU64::new(5));
-        assert_eq!(find_gcd([0, 0].into_iter()), None);
-    }
-}
--- a/columnar/src/column_values/linear.rs
+++ b/columnar/src/column_values/linear.rs
@@ -1,230 +0,0 @@
-use std::io::{self, Write};
-
-use common::{BinarySerializable, OwnedBytes};
-use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
-
-use super::line::Line;
-use super::serialize::NormalizedHeader;
-use super::{ColumnValues, FastFieldCodec, FastFieldCodecType};
-
-/// Depending on the field type, a different
-/// fast field is required.
-#[derive(Clone)]
-pub struct LinearReader {
-    data: OwnedBytes,
-    linear_params: LinearParams,
-    header: NormalizedHeader,
-}
-
-impl ColumnValues for LinearReader {
-    #[inline]
-    fn get_val(&self, doc: u32) -> u64 {
-        let interpoled_val: u64 = self.linear_params.line.eval(doc);
-        let bitpacked_diff = self.linear_params.bit_unpacker.get(doc, &self.data);
-        interpoled_val.wrapping_add(bitpacked_diff)
-    }
-
-    #[inline(always)]
-    fn min_value(&self) -> u64 {
-        // The LinearReader assumes a normalized vector.
-        0u64
-    }
-
-    #[inline(always)]
-    fn max_value(&self) -> u64 {
-        self.header.max_value
-    }
-
-    #[inline]
-    fn num_vals(&self) -> u32 {
-        self.header.num_vals
-    }
-}
-
-/// Fastfield serializer, which tries to guess values by linear interpolation
-/// and stores the difference bitpacked.
-pub struct LinearCodec;
-
-#[derive(Debug, Clone)]
-struct LinearParams {
-    line: Line,
-    bit_unpacker: BitUnpacker,
-}
-
-impl BinarySerializable for LinearParams {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        self.line.serialize(writer)?;
-        self.bit_unpacker.bit_width().serialize(writer)?;
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let line = Line::deserialize(reader)?;
-        let bit_width = u8::deserialize(reader)?;
-        Ok(Self {
-            line,
-            bit_unpacker: BitUnpacker::new(bit_width),
-        })
-    }
-}
-
-impl FastFieldCodec for LinearCodec {
-    const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear;
-
-    type Reader = LinearReader;
-
-    /// Opens a fast field given a file.
-    fn open_from_bytes(mut data: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader> {
-        let linear_params = LinearParams::deserialize(&mut data)?;
-        Ok(LinearReader {
-            data,
-            linear_params,
-            header,
-        })
-    }
-
-    /// Creates a new fast field serializer.
-    fn serialize(column: &dyn ColumnValues, write: &mut impl Write) -> io::Result<()> {
-        assert_eq!(column.min_value(), 0);
-        let line = Line::train(column);
-
-        let max_offset_from_line = column
-            .iter()
-            .enumerate()
-            .map(|(pos, actual_value)| {
-                let calculated_value = line.eval(pos as u32);
-                actual_value.wrapping_sub(calculated_value)
-            })
-            .max()
-            .unwrap();
-
-        let num_bits = compute_num_bits(max_offset_from_line);
-        let linear_params = LinearParams {
-            line,
-            bit_unpacker: BitUnpacker::new(num_bits),
-        };
-        linear_params.serialize(write)?;
-
-        let mut bit_packer = BitPacker::new();
-        for (pos, actual_value) in column.iter().enumerate() {
-            let calculated_value = line.eval(pos as u32);
-            let offset = actual_value.wrapping_sub(calculated_value);
-            bit_packer.write(offset, num_bits, write)?;
-        }
-        bit_packer.close(write)?;
-
-        Ok(())
-    }
-
-    /// estimation for linear interpolation is hard because, you don't know
-    /// where the local maxima for the deviation of the calculated value are and
-    /// the offset to shift all values to >=0 is also unknown.
-    #[allow(clippy::question_mark)]
-    fn estimate(column: &dyn ColumnValues) -> Option<f32> {
-        if column.num_vals() < 3 {
-            return None; // disable compressor for this case
-        }
-
-        let limit_num_vals = column.num_vals().min(100_000);
-
-        let num_samples = 100;
-        let step_size = (limit_num_vals / num_samples).max(1); // 20 samples
-        let mut sample_positions_and_values: Vec<_> = Vec::new();
-        for (pos, val) in column.iter().enumerate().step_by(step_size as usize) {
-            sample_positions_and_values.push((pos as u64, val));
-        }
-
-        let line = Line::estimate(&sample_positions_and_values);
-
-        let estimated_bit_width = sample_positions_and_values
-            .into_iter()
-            .map(|(pos, actual_value)| {
-                let interpolated_val = line.eval(pos as u32);
-                actual_value.wrapping_sub(interpolated_val)
-            })
-            .map(|diff| ((diff as f32 * 1.5) * 2.0) as u64)
-            .map(compute_num_bits)
-            .max()
-            .unwrap_or(0);
-
-        // Extrapolate to whole column
-        let num_bits = (estimated_bit_width as u64 * column.num_vals() as u64) + 64;
-        let num_bits_uncompressed = 64 * column.num_vals();
-        Some(num_bits as f32 / num_bits_uncompressed as f32)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use rand::RngCore;
-
-    use super::*;
-    use crate::column_values::tests;
-
-    fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
-        tests::create_and_validate::<LinearCodec>(data, name)
-    }
-
-    #[test]
-    fn test_compression() {
-        let data = (10..=6_000_u64).collect::<Vec<_>>();
-        let (estimate, actual_compression) =
-            create_and_validate(&data, "simple monotonically large").unwrap();
-
-        assert_le!(actual_compression, 0.001);
-        assert_le!(estimate, 0.02);
-    }
-
-    #[test]
-    fn test_with_codec_datasets() {
-        let data_sets = tests::get_codec_test_datasets();
-        for (mut data, name) in data_sets {
-            create_and_validate(&data, name);
-            data.reverse();
-            create_and_validate(&data, name);
-        }
-    }
-    #[test]
-    fn linear_interpol_fast_field_test_large_amplitude() {
-        let data = vec![
-            i64::MAX as u64 / 2,
-            i64::MAX as u64 / 3,
-            i64::MAX as u64 / 2,
-        ];
-
-        create_and_validate(&data, "large amplitude");
-    }
-
-    #[test]
-    fn overflow_error_test() {
-        let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
-        create_and_validate(&data, "overflow test");
-    }
-
-    #[test]
-    fn linear_interpol_fast_concave_data() {
-        let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
-        create_and_validate(&data, "concave data");
-    }
-    #[test]
-    fn linear_interpol_fast_convex_data() {
-        let data = vec![0, 40, 60, 70, 75, 77];
-        create_and_validate(&data, "convex data");
-    }
-    #[test]
-    fn linear_interpol_fast_field_test_simple() {
-        let data = (10..=20_u64).collect::<Vec<_>>();
-        create_and_validate(&data, "simple monotonically");
-    }
-
-    #[test]
-    fn linear_interpol_fast_field_rand() {
-        let mut rng = rand::thread_rng();
-        for _ in 0..50 {
-            let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
-            create_and_validate(&data, "random");
-            data.reverse();
-            create_and_validate(&data, "random");
-        }
-    }
-}
--- a/columnar/src/column_values/main.rs
+++ b/columnar/src/column_values/main.rs
@@ -1,222 +0,0 @@
-#[macro_use]
-extern crate prettytable;
-use std::collections::HashSet;
-use std::env;
-use std::io::BufRead;
-use std::net::{IpAddr, Ipv6Addr};
-use std::str::FromStr;
-
-use common::OwnedBytes;
-use fastfield_codecs::{open_u128, serialize_u128, Column, FastFieldCodecType, VecColumn};
-use itertools::Itertools;
-use measure_time::print_time;
-use prettytable::{Cell, Row, Table};
-
-fn print_set_stats(ip_addrs: &[u128]) {
-    println!("NumIps\t{}", ip_addrs.len());
-    let ip_addr_set: HashSet<u128> = ip_addrs.iter().cloned().collect();
-    println!("NumUniqueIps\t{}", ip_addr_set.len());
-    let ratio_unique = ip_addr_set.len() as f64 / ip_addrs.len() as f64;
-    println!("RatioUniqueOverTotal\t{ratio_unique:.4}");
-
-    // histogram
-    let mut ip_addrs = ip_addrs.to_vec();
-    ip_addrs.sort();
-    let mut cnts: Vec<usize> = ip_addrs
-        .into_iter()
-        .dedup_with_count()
-        .map(|(cnt, _)| cnt)
-        .collect();
-    cnts.sort();
-
-    let top_256_cnt: usize = cnts.iter().rev().take(256).sum();
-    let top_128_cnt: usize = cnts.iter().rev().take(128).sum();
-    let top_64_cnt: usize = cnts.iter().rev().take(64).sum();
-    let top_8_cnt: usize = cnts.iter().rev().take(8).sum();
-    let total: usize = cnts.iter().sum();
-
-    println!("{}", total);
-    println!("{}", top_256_cnt);
-    println!("{}", top_128_cnt);
-    println!("Percentage Top8 {:02}", top_8_cnt as f32 / total as f32);
-    println!("Percentage Top64 {:02}", top_64_cnt as f32 / total as f32);
-    println!("Percentage Top128 {:02}", top_128_cnt as f32 / total as f32);
-    println!("Percentage Top256 {:02}", top_256_cnt as f32 / total as f32);
-
-    let mut cnts: Vec<(usize, usize)> = cnts.into_iter().dedup_with_count().collect();
-    cnts.sort_by(|a, b| {
-        if a.1 == b.1 {
-            a.0.cmp(&b.0)
-        } else {
-            b.1.cmp(&a.1)
-        }
-    });
-}
-
-fn ip_dataset() -> Vec<u128> {
-    let mut ip_addr_v4 = 0;
-
-    let stdin = std::io::stdin();
-    let ip_addrs: Vec<u128> = stdin
-        .lock()
-        .lines()
-        .flat_map(|line| {
-            let line = line.unwrap();
-            let line = line.trim();
-            let ip_addr = IpAddr::from_str(line.trim()).ok()?;
-            if ip_addr.is_ipv4() {
-                ip_addr_v4 += 1;
-            }
-            let ip_addr_v6: Ipv6Addr = match ip_addr {
-                IpAddr::V4(v4) => v4.to_ipv6_mapped(),
-                IpAddr::V6(v6) => v6,
-            };
-            Some(ip_addr_v6)
-        })
-        .map(|ip_v6| u128::from_be_bytes(ip_v6.octets()))
-        .collect();
-
-    println!("IpAddrsAny\t{}", ip_addrs.len());
-    println!("IpAddrsV4\t{}", ip_addr_v4);
-
-    ip_addrs
-}
-
-fn bench_ip() {
-    let dataset = ip_dataset();
-    print_set_stats(&dataset);
-
-    // Chunks
-    {
-        let mut data = vec![];
-        for dataset in dataset.chunks(500_000) {
-            serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
-        }
-        let compression = data.len() as f64 / (dataset.len() * 16) as f64;
-        println!("Compression 50_000 chunks {:.4}", compression);
-        println!(
-            "Num Bits per elem {:.2}",
-            (data.len() * 8) as f32 / dataset.len() as f32
-        );
-    }
-
-    let mut data = vec![];
-    {
-        print_time!("creation");
-        serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
-    }
-
-    let compression = data.len() as f64 / (dataset.len() * 16) as f64;
-    println!("Compression {:.2}", compression);
-    println!(
-        "Num Bits per elem {:.2}",
-        (data.len() * 8) as f32 / dataset.len() as f32
-    );
-
-    let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
-    // Sample some ranges
-    let mut doc_values = Vec::new();
-    for value in dataset.iter().take(1110).skip(1100).cloned() {
-        doc_values.clear();
-        print_time!("get range");
-        decompressor.get_docids_for_value_range(
-            value..=value,
-            0..decompressor.num_vals(),
-            &mut doc_values,
-        );
-        println!("{:?}", doc_values.len());
-    }
-}
-
-fn main() {
-    if env::args().nth(1).unwrap() == "bench_ip" {
-        bench_ip();
-        return;
-    }
-
-    let mut table = Table::new();
-
-    // Add a row per time
-    table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
-
-    for (data, data_set_name) in get_codec_test_data_sets() {
-        let results: Vec<(f32, f32, FastFieldCodecType)> = [
-            serialize_with_codec(&data, FastFieldCodecType::Bitpacked),
-            serialize_with_codec(&data, FastFieldCodecType::Linear),
-            serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear),
-        ]
-        .into_iter()
-        .flatten()
-        .collect();
-        let best_compression_ratio_codec = results
-            .iter()
-            .min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
-            .cloned()
-            .unwrap();
-
-        table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
-        for (est, comp, codec_type) in results {
-            let est_cell = est.to_string();
-            let ratio_cell = comp.to_string();
-            let style = if comp == best_compression_ratio_codec.1 {
-                "Fb"
-            } else {
-                ""
-            };
-            table.add_row(Row::new(vec![
-                Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
-                Cell::new(&ratio_cell).style_spec(style),
-                Cell::new(&est_cell).style_spec(""),
-            ]));
-        }
-    }
-
-    table.printstd();
-}
-
-pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
-    let mut data_and_names = vec![];
-
-    let data = (1000..=200_000_u64).collect::<Vec<_>>();
-    data_and_names.push((data, "Autoincrement"));
-
-    let mut current_cumulative = 0;
-    let data = (1..=200_000_u64)
-        .map(|num| {
-            let num = (num as f32 + num as f32).log10() as u64;
-            current_cumulative += num;
-            current_cumulative
-        })
-        .collect::<Vec<_>>();
-    // let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
-    data_and_names.push((data, "Monotonically increasing concave"));
-
-    let mut current_cumulative = 0;
-    let data = (1..=200_000_u64)
-        .map(|num| {
-            let num = (200_000.0 - num as f32).log10() as u64;
-            current_cumulative += num;
-            current_cumulative
-        })
-        .collect::<Vec<_>>();
-    data_and_names.push((data, "Monotonically increasing convex"));
-
-    let data = (1000..=200_000_u64)
-        .map(|num| num + rand::random::<u8>() as u64)
-        .collect::<Vec<_>>();
-    data_and_names.push((data, "Almost monotonically increasing"));
-
-    data_and_names
-}
-
-pub fn serialize_with_codec(
-    data: &[u64],
-    codec_type: FastFieldCodecType,
-) -> Option<(f32, f32, FastFieldCodecType)> {
-    let col = VecColumn::from(data);
-    let estimation = fastfield_codecs::estimate(&col, codec_type)?;
-    let mut out = Vec::new();
-    fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?;
-    let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32;
-    Some((estimation, actual_compression, codec_type))
-}
--- a/columnar/src/column_values/merge.rs
+++ b/columnar/src/column_values/merge.rs
@@ -0,0 +1,40 @@
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use crate::iterable::Iterable;
+use crate::{ColumnIndex, ColumnValues, MergeRowOrder};
+
+pub(crate) struct MergedColumnValues<'a, T> {
+    pub(crate) column_indexes: &'a [ColumnIndex],
+    pub(crate) column_values: &'a [Option<Arc<dyn ColumnValues<T>>>],
+    pub(crate) merge_row_order: &'a MergeRowOrder,
+}
+
+impl<'a, T: Copy + PartialOrd + Debug> Iterable<T> for MergedColumnValues<'a, T> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
+        match self.merge_row_order {
+            MergeRowOrder::Stack(_) => Box::new(
+                self.column_values
+                    .iter()
+                    .flatten()
+                    .flat_map(|column_value| column_value.iter()),
+            ),
+            MergeRowOrder::Shuffled(shuffle_merge_order) => Box::new(
+                shuffle_merge_order
+                    .iter_new_to_old_row_addrs()
+                    .flat_map(|row_addr| {
+                        let column_index = &self.column_indexes[row_addr.segment_ord as usize];
+                        let column_values =
+                            self.column_values[row_addr.segment_ord as usize].as_ref()?;
+                        let value_range = column_index.value_row_ids(row_addr.row_id);
+                        Some((value_range, column_values))
+                    })
+                    .flat_map(|(value_range, column_values)| {
+                        value_range
+                            .into_iter()
+                            .map(|val| column_values.get_val(val))
+                    }),
+            ),
+        }
+    }
+}
--- a/columnar/src/column_values/mod.rs
+++ b/columnar/src/column_values/mod.rs
@@ -1,326 +1,208 @@
 #![warn(missing_docs)]
-#![cfg_attr(all(feature = "unstable", test), feature(test))]

 //! # `fastfield_codecs`
 //!
-//! - Columnar storage of data for tantivy [`Column`].
+//! - Columnar storage of data for tantivy [`crate::Column`].
 //! - Encode data in different codecs.
 //! - Monotonically map values to u64/u128

-#[cfg(test)]
-mod tests;
-
-use std::io;
-use std::io::Write;
+use std::fmt::Debug;
+use std::ops::{Range, RangeInclusive};
 use std::sync::Arc;

-use common::{BinarySerializable, OwnedBytes};
-use compact_space::CompactSpaceDecompressor;
-use monotonic_mapping::{
-    StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
-    StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
-};
-use serialize::{Header, U128Header};
+pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
+pub use monotonic_mapping_u128::MonotonicallyMappableToU128;

-mod bitpacked;
-mod blockwise_linear;
-mod compact_space;
-mod line;
-mod linear;
+mod merge;
 pub(crate) mod monotonic_mapping;
 pub(crate) mod monotonic_mapping_u128;
+mod stats;
+mod u128_based;
+mod u64_based;
+mod vec_column;

-mod column;
-mod column_with_cardinality;
-mod gcd;
-pub mod serialize;
+mod monotonic_column;

-pub use self::column::{monotonic_map_column, ColumnValues, IterColumn, VecColumn};
-pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
-pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
-#[cfg(test)]
-pub use self::serialize::tests::serialize_and_load;
-pub use self::serialize::{serialize_column_values, NormalizedHeader};
-use crate::column_values::bitpacked::BitpackedCodec;
-use crate::column_values::blockwise_linear::BlockwiseLinearCodec;
-use crate::column_values::linear::LinearCodec;
+pub(crate) use merge::MergedColumnValues;
+pub use stats::ColumnStats;
+pub use u128_based::{open_u128_mapped, serialize_column_values_u128};
+pub use u64_based::{
+    load_u64_based_column_values, serialize_and_load_u64_based_column_values,
+    serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
+};
+pub use vec_column::VecColumn;

-#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
-#[repr(u8)]
-/// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted data.
-pub enum FastFieldCodecType {
-    /// Bitpack all values in the value range. The number of bits is defined by the amplitude
-    /// `column.max_value() - column.min_value()`
-    Bitpacked = 1,
-    /// Linear interpolation puts a line between the first and last value and then bitpacks the
-    /// values by the offset from the line. The number of bits is defined by the max deviation from
-    /// the line.
-    Linear = 2,
-    /// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements.
-    BlockwiseLinear = 3,
-}
+pub use self::monotonic_column::monotonic_map_column;
+use crate::RowId;

-impl BinarySerializable for FastFieldCodecType {
-    fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
-        self.to_code().serialize(wrt)
-    }
+/// `ColumnValues` provides access to a dense field column.
+///
+/// A `Column` is just a wrapper over `ColumnValues` and a `ColumnIndex`.
+///
+/// Any methods with a default and specialized implementation need to be called in the
+/// wrappers that implement the trait: Arc and MonotonicMappingColumn
+pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
+    /// Return the value associated with the given idx.
+    ///
+    /// This accessor should return as fast as possible.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `idx` is greater than the column length.
+    fn get_val(&self, idx: u32) -> T;

-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let code = u8::deserialize(reader)?;
-        let codec_type: Self = Self::from_code(code)
-            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
-        Ok(codec_type)
-    }
-}
+    /// Allows to push down multiple fetch calls, to avoid dynamic dispatch overhead.
+    ///
+    /// `indexes` and `output` must have the same length.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the slice lengths differ; may panic if an index is out of bounds.
+    fn get_vals(&self, indexes: &[u32], output: &mut [T]) {
+        assert!(indexes.len() == output.len());
+        let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
+        for (out_x4, idx_x4) in out_and_idx_chunks {
+            out_x4[0] = self.get_val(idx_x4[0]);
+            out_x4[1] = self.get_val(idx_x4[1]);
+            out_x4[2] = self.get_val(idx_x4[2]);
+            out_x4[3] = self.get_val(idx_x4[3]);
+        }

-impl FastFieldCodecType {
-    pub(crate) fn to_code(self) -> u8 {
-        self as u8
-    }
+        let step_size = 4;
+        let cutoff = indexes.len() - indexes.len() % step_size;

-    pub(crate) fn from_code(code: u8) -> Option<Self> {
-        match code {
-            1 => Some(Self::Bitpacked),
-            2 => Some(Self::Linear),
-            3 => Some(Self::BlockwiseLinear),
-            _ => None,
+        for idx in cutoff..indexes.len() {
+            output[idx] = self.get_val(indexes[idx]);
        }
    }
-}

-#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
-#[repr(u8)]
-/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data.
-pub enum U128FastFieldCodecType {
-    /// This codec takes a large number space (u128) and reduces it to a compact number space, by
-    /// removing the holes.
-    CompactSpace = 1,
-}
-
-impl BinarySerializable for U128FastFieldCodecType {
-    fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
-        self.to_code().serialize(wrt)
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let code = u8::deserialize(reader)?;
-        let codec_type: Self = Self::from_code(code)
-            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
-        Ok(codec_type)
-    }
-}
-
-impl U128FastFieldCodecType {
-    pub(crate) fn to_code(self) -> u8 {
-        self as u8
-    }
-
-    pub(crate) fn from_code(code: u8) -> Option<Self> {
-        match code {
-            1 => Some(Self::CompactSpace),
-            _ => None,
+    /// Fills an output buffer with the fast field values
+    /// associated with the `DocId` going from
+    /// `start` to `start + output.len()`.
+    ///
+    /// # Panics
+    ///
+    /// Must panic if `start + output.len()` is greater than
+    /// the segment's `maxdoc`.
+    #[inline(always)]
+    fn get_range(&self, start: u64, output: &mut [T]) {
+        for (out, idx) in output.iter_mut().zip(start..) {
+            *out = self.get_val(idx as u32);
        }
    }
-}

-/// Returns the correct codec reader wrapped in the `Arc` for the data.
-pub fn open_u128_mapped<T: MonotonicallyMappableToU128>(
-    mut bytes: OwnedBytes,
-) -> io::Result<Arc<dyn ColumnValues<T>>> {
-    let header = U128Header::deserialize(&mut bytes)?;
-    assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
-    let reader = CompactSpaceDecompressor::open(bytes)?;
-
-    let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<T>> =
-        StrictlyMonotonicMappingToInternal::<T>::new().into();
-    Ok(Arc::new(monotonic_map_column(reader, inverted)))
-}
-
-/// Returns the correct codec reader wrapped in the `Arc` for the data.
-pub fn open_u64_mapped<T: MonotonicallyMappableToU64>(
-    mut bytes: OwnedBytes,
-) -> io::Result<Arc<dyn ColumnValues<T>>> {
-    let header = Header::deserialize(&mut bytes)?;
-    match header.codec_type {
-        FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes, &header),
-        FastFieldCodecType::Linear => open_specific_codec::<LinearCodec, _>(bytes, &header),
-        FastFieldCodecType::BlockwiseLinear => {
-            open_specific_codec::<BlockwiseLinearCodec, _>(bytes, &header)
+    /// Get the row ids of values which are in the provided value range.
+    ///
+    /// Note that position == docid for single value fast fields
+    fn get_row_ids_for_value_range(
+        &self,
+        value_range: RangeInclusive<T>,
+        row_id_range: Range<RowId>,
+        row_id_hits: &mut Vec<RowId>,
+    ) {
+        let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
+        for idx in row_id_range.start..row_id_range.end {
+            let val = self.get_val(idx);
+            if value_range.contains(&val) {
+                row_id_hits.push(idx);
+            }
        }
    }
-}

-fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
-    bytes: OwnedBytes,
-    header: &Header,
-) -> io::Result<Arc<dyn ColumnValues<Item>>> {
-    let normalized_header = header.normalized();
-    let reader = C::open_from_bytes(bytes, normalized_header)?;
-    let min_value = header.min_value;
-    if let Some(gcd) = header.gcd {
-        let mapping = StrictlyMonotonicMappingInverter::from(
-            StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
-        );
-        Ok(Arc::new(monotonic_map_column(reader, mapping)))
-    } else {
-        let mapping = StrictlyMonotonicMappingInverter::from(
-            StrictlyMonotonicMappingToInternalBaseval::new(min_value),
-        );
-        Ok(Arc::new(monotonic_map_column(reader, mapping)))
+    /// Returns a lower bound for this column of values.
+    ///
+    /// All values are guaranteed to be greater than or equal to `.min_value()`,
+    /// but this value is not necessarily the best boundary value.
+    ///
+    /// We have
+    /// ∀i < self.num_vals(), self.get_val(i) >= self.min_value()
+    /// But we don't necessarily have
+    /// ∃i < self.num_vals(), self.get_val(i) == self.min_value()
+    fn min_value(&self) -> T;
+
+    /// Returns an upper bound for this column of values.
+    ///
+    /// All values are guaranteed to be lower than or equal to `.max_value()`,
+    /// but this value is not necessarily the best boundary value.
+    ///
+    /// We have
+    /// ∀i < self.num_vals(), self.get_val(i) <= self.max_value()
+    /// But we don't necessarily have
+    /// ∃i < self.num_vals(), self.get_val(i) == self.max_value()
+    fn max_value(&self) -> T;
+
+    /// The number of values in the column.
+    fn num_vals(&self) -> u32;
+
+    /// Returns an iterator over the data.
+    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
+        Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
    }
 }

-/// The FastFieldSerializerEstimate trait is required on all variants
-/// of fast field compressions, to decide which one to choose.
-pub(crate) trait FastFieldCodec: 'static {
-    /// A codex needs to provide a unique name and id, which is
-    /// used for debugging and de/serialization.
-    const CODEC_TYPE: FastFieldCodecType;
+/// Empty column of values.
+pub struct EmptyColumnValues;

-    type Reader: ColumnValues<u64> + 'static;
+impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
+    fn get_val(&self, _idx: u32) -> T {
+        panic!("Internal Error: Called get_val of empty column.")
+    }

-    /// Reads the metadata and returns the CodecReader
-    fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader>;
+    fn min_value(&self) -> T {
+        T::default()
+    }

-    /// Serializes the data using the serializer into write.
-    ///
-    /// The column iterator should be preferred over using column `get_val` method for
-    /// performance reasons.
-    fn serialize(column: &dyn ColumnValues, write: &mut impl Write) -> io::Result<()>;
+    fn max_value(&self) -> T {
+        T::default()
+    }

-    /// Returns an estimate of the compression ratio.
-    /// If the codec is not applicable, returns `None`.
-    ///
-    /// The baseline is uncompressed 64bit data.
-    ///
-    /// It could make sense to also return a value representing
-    /// computational complexity.
-    fn estimate(column: &dyn ColumnValues) -> Option<f32>;
+    fn num_vals(&self) -> u32 {
+        0
+    }
+}
+
+impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for Arc<dyn ColumnValues<T>> {
+    #[inline(always)]
+    fn get_val(&self, idx: u32) -> T {
+        self.as_ref().get_val(idx)
+    }
+
+    #[inline(always)]
+    fn min_value(&self) -> T {
+        self.as_ref().min_value()
+    }
+
+    #[inline(always)]
+    fn max_value(&self) -> T {
+        self.as_ref().max_value()
+    }
+
+    #[inline(always)]
+    fn num_vals(&self) -> u32 {
+        self.as_ref().num_vals()
+    }
+
+    #[inline(always)]
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
+        self.as_ref().iter()
+    }
+
+    #[inline(always)]
+    fn get_range(&self, start: u64, output: &mut [T]) {
+        self.as_ref().get_range(start, output)
+    }
+
+    #[inline(always)]
+    fn get_row_ids_for_value_range(
+        &self,
+        range: RangeInclusive<T>,
+        doc_id_range: Range<u32>,
+        positions: &mut Vec<u32>,
+    ) {
+        self.as_ref()
+            .get_row_ids_for_value_range(range, doc_id_range, positions)
+    }
 }

 #[cfg(all(test, feature = "unstable"))]
-mod bench {
-    use std::sync::Arc;
-
-    use common::OwnedBytes;
-    use rand::rngs::StdRng;
-    use rand::{Rng, SeedableRng};
-    use test::{self, Bencher};
-
-    use super::*;
-
-    fn get_data() -> Vec<u64> {
-        let mut rng = StdRng::seed_from_u64(2u64);
-        let mut data: Vec<_> = (100..55000_u64)
-            .map(|num| num + rng.gen::<u8>() as u64)
-            .collect();
-        data.push(99_000);
-        data.insert(1000, 2000);
-        data.insert(2000, 100);
-        data.insert(3000, 4100);
-        data.insert(4000, 100);
-        data.insert(5000, 800);
-        data
-    }
-
-    #[inline(never)]
-    fn value_iter() -> impl Iterator<Item = u64> {
-        0..20_000
-    }
-    fn get_reader_for_bench<Codec: FastFieldCodec>(data: &[u64]) -> Codec::Reader {
-        let mut bytes = Vec::new();
-        let min_value = *data.iter().min().unwrap();
-        let data = data.iter().map(|el| *el - min_value).collect::<Vec<_>>();
-        let col = VecColumn::from(&data);
-        let normalized_header = NormalizedHeader {
-            num_vals: col.num_vals(),
-            max_value: col.max_value(),
-        };
-        Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap();
-        Codec::open_from_bytes(OwnedBytes::new(bytes), normalized_header).unwrap()
-    }
-    fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
-        let col = get_reader_for_bench::<Codec>(data);
-        b.iter(|| {
-            let mut sum = 0u64;
-            for pos in value_iter() {
-                let val = col.get_val(pos as u32);
-                sum = sum.wrapping_add(val);
-            }
-            sum
-        });
-    }
-
-    #[inline(never)]
-    fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn ColumnValues>) {
-        b.iter(|| {
-            let mut sum = 0u64;
-            for pos in value_iter() {
-                let val = col.get_val(pos as u32);
-                sum = sum.wrapping_add(val);
-            }
-            sum
-        });
-    }
-
-    fn bench_get_dynamic<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
-        let col = Arc::new(get_reader_for_bench::<Codec>(data));
-        bench_get_dynamic_helper(b, col);
-    }
-    fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
-        let min_value = *data.iter().min().unwrap();
-        let data = data.iter().map(|el| *el - min_value).collect::<Vec<_>>();
-
-        let mut bytes = Vec::new();
-        b.iter(|| {
-            bytes.clear();
-            Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap();
-        });
-    }
-
-    #[bench]
-    fn bench_fastfield_bitpack_create(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_create::<BitpackedCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_create::<LinearCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_create::<BlockwiseLinearCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_bitpack_get(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_get::<BitpackedCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_get_dynamic::<BitpackedCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_get::<LinearCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_get_dynamic::<LinearCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_get::<BlockwiseLinearCodec>(b, &data);
-    }
-    #[bench]
-    fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
-        let data: Vec<_> = get_data();
-        bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
-    }
-}
+mod bench;
--- a/columnar/src/column_values/monotonic_column.rs
+++ b/columnar/src/column_values/monotonic_column.rs
@@ -0,0 +1,120 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::ops::{Range, RangeInclusive};
+
+use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
+use crate::ColumnValues;
+
+struct MonotonicMappingColumn<C, T, Input> {
+    from_column: C,
+    monotonic_mapping: T,
+    _phantom: PhantomData<Input>,
+}
+
+/// Creates a view of a column transformed by a strictly monotonic mapping. See
+/// [`StrictlyMonotonicFn`].
+///
+/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
+/// monotonic_mapping.mapping() is expected to be injective, and we should always have
+/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
+///
+/// The inverse of the mapping is required for:
+/// `ColumnValues::get_row_ids_for_value_range`.
+/// The user provides the original value range, and we need to map it monotonically in the same
+/// way the serialization does before calling the underlying column.
+///
+/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
+/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
+/// monotonic_mapping during serialization.
+pub fn monotonic_map_column<C, T, Input, Output>(
+    from_column: C,
+    monotonic_mapping: T,
+) -> impl ColumnValues<Output>
+where
+    C: ColumnValues<Input>,
+    T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
+    Input: PartialOrd + Debug + Send + Sync + Clone,
+    Output: PartialOrd + Debug + Send + Sync + Clone,
+{
+    MonotonicMappingColumn {
+        from_column,
+        monotonic_mapping,
+        _phantom: PhantomData,
+    }
+}
+
+impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input>
+where
+    C: ColumnValues<Input>,
+    T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
+    Input: PartialOrd + Send + Debug + Sync + Clone,
+    Output: PartialOrd + Send + Debug + Sync + Clone,
+{
+    #[inline(always)]
+    fn get_val(&self, idx: u32) -> Output {
+        let from_val = self.from_column.get_val(idx);
+        self.monotonic_mapping.mapping(from_val)
+    }
+
+    fn min_value(&self) -> Output {
+        let from_min_value = self.from_column.min_value();
+        self.monotonic_mapping.mapping(from_min_value)
+    }
+
+    fn max_value(&self) -> Output {
+        let from_max_value = self.from_column.max_value();
+        self.monotonic_mapping.mapping(from_max_value)
+    }
+
+    fn num_vals(&self) -> u32 {
+        self.from_column.num_vals()
+    }
+
+    fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
+        Box::new(
+            self.from_column
+                .iter()
+                .map(|el| self.monotonic_mapping.mapping(el)),
+        )
+    }
+
+    fn get_row_ids_for_value_range(
+        &self,
+        range: RangeInclusive<Output>,
+        doc_id_range: Range<u32>,
+        positions: &mut Vec<u32>,
+    ) {
+        self.from_column.get_row_ids_for_value_range(
+            self.monotonic_mapping.inverse(range.start().clone())
+                ..=self.monotonic_mapping.inverse(range.end().clone()),
+            doc_id_range,
+            positions,
+        )
+    }
+
+    // We voluntarily do not implement get_range as it yields a regression,
+    // and we do not have any specialized implementation anyway.
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::column_values::monotonic_mapping::{
+        StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
+    };
+    use crate::column_values::VecColumn;
+
+    #[test]
+    fn test_monotonic_mapping_iter() {
+        let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
+        let col = VecColumn::from(&vals);
+        let mapped = monotonic_map_column(
+            col,
+            StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<i64>::new()),
+        );
+        let val_i64s: Vec<u64> = mapped.iter().collect();
+        for i in 0..100 {
+            assert_eq!(val_i64s[i as usize], mapped.get_val(i));
+        }
+    }
+}
--- a/columnar/src/column_values/monotonic_mapping.rs
+++ b/columnar/src/column_values/monotonic_mapping.rs
@@ -1,13 +1,14 @@
+use std::fmt::Debug;
 use std::marker::PhantomData;

-use fastdivide::DividerU64;
+use common::DateTime;

 use super::MonotonicallyMappableToU128;
 use crate::RowId;

 /// Monotonic maps a value to u64 value space.
 /// Monotonic mapping enables `PartialOrd` on u64 space without conversion to original space.
-pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
+pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Debug + Copy + Send + Sync {
    /// Converts a value to u64.
    ///
    /// Internally all fast field values are encoded as u64.
@@ -111,65 +112,6 @@ where T: MonotonicallyMappableToU64
    }
 }

-/// Mapping dividing by  gcd and a base value.
-///
-/// The function is assumed to be only called on values divided by passed
-/// gcd value. (It is necessary for the function to be monotonic.)
-pub(crate) struct StrictlyMonotonicMappingToInternalGCDBaseval {
-    gcd_divider: DividerU64,
-    gcd: u64,
-    min_value: u64,
-}
-impl StrictlyMonotonicMappingToInternalGCDBaseval {
-    pub(crate) fn new(gcd: u64, min_value: u64) -> Self {
-        let gcd_divider = DividerU64::divide_by(gcd);
-        Self {
-            gcd_divider,
-            gcd,
-            min_value,
-        }
-    }
-}
-impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
-    for StrictlyMonotonicMappingToInternalGCDBaseval
-{
-    #[inline(always)]
-    fn mapping(&self, inp: External) -> u64 {
-        self.gcd_divider
-            .divide(External::to_u64(inp) - self.min_value)
-    }
-
-    #[inline(always)]
-    fn inverse(&self, out: u64) -> External {
-        External::from_u64(self.min_value + out * self.gcd)
-    }
-}
-
-/// Strictly monotonic mapping with a base value.
-pub(crate) struct StrictlyMonotonicMappingToInternalBaseval {
-    min_value: u64,
-}
-impl StrictlyMonotonicMappingToInternalBaseval {
-    #[inline(always)]
-    pub(crate) fn new(min_value: u64) -> Self {
-        Self { min_value }
-    }
-}
-
-impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
-    for StrictlyMonotonicMappingToInternalBaseval
-{
-    #[inline(always)]
-    fn mapping(&self, val: External) -> u64 {
-        External::to_u64(val) - self.min_value
-    }
-
-    #[inline(always)]
-    fn inverse(&self, val: u64) -> External {
-        External::from_u64(self.min_value + val)
-    }
-}
-
 impl MonotonicallyMappableToU64 for u64 {
    #[inline(always)]
    fn to_u64(self) -> u64 {
@@ -194,17 +136,15 @@ impl MonotonicallyMappableToU64 for i64 {
    }
 }

-impl MonotonicallyMappableToU64 for crate::DateTime {
+impl MonotonicallyMappableToU64 for DateTime {
    #[inline(always)]
    fn to_u64(self) -> u64 {
-        common::i64_to_u64(self.timestamp_micros)
+        common::i64_to_u64(self.into_timestamp_nanos())
    }

    #[inline(always)]
    fn from_u64(val: u64) -> Self {
-        crate::DateTime {
-            timestamp_micros: common::u64_to_i64(val),
-        }
+        DateTime::from_timestamp_nanos(common::u64_to_i64(val))
    }
 }

@@ -260,13 +200,6 @@ mod tests {
        // TODO
        // identity mapping
        // test_round_trip(&StrictlyMonotonicMappingToInternal::<u128>::new(), 100u128);
-
-        // base value to i64 round trip
-        let mapping = StrictlyMonotonicMappingToInternalBaseval::new(100);
-        test_round_trip::<_, _, u64>(&mapping, 100i64);
-        // base value and gcd to u64 round trip
-        let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100);
-        test_round_trip::<_, _, u64>(&mapping, 100u64);
    }

    fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L>(
--- a/columnar/src/column_values/monotonic_mapping_u128.rs
+++ b/columnar/src/column_values/monotonic_mapping_u128.rs
@@ -1,8 +1,9 @@
+use std::fmt::Debug;
 use std::net::Ipv6Addr;

 /// Montonic maps a value to u128 value space
 /// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
-pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Send + Sync {
+pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync {
    /// Converts a value to u128.
    ///
    /// Internally all fast field values are encoded as u64.
--- a/columnar/src/column_values/serialize.rs
+++ b/columnar/src/column_values/serialize.rs
@@ -1,320 +0,0 @@
-// Copyright (C) 2022 Quickwit, Inc.
-//
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
-//
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-use std::io;
-use std::num::NonZeroU64;
-
-use common::{BinarySerializable, VInt};
-use log::warn;
-
-use super::bitpacked::BitpackedCodec;
-use super::blockwise_linear::BlockwiseLinearCodec;
-use super::linear::LinearCodec;
-use super::monotonic_mapping::{
-    StrictlyMonotonicFn, StrictlyMonotonicMappingToInternal,
-    StrictlyMonotonicMappingToInternalGCDBaseval,
-};
-use super::{
-    monotonic_map_column, ColumnValues, FastFieldCodec, FastFieldCodecType,
-    MonotonicallyMappableToU64, U128FastFieldCodecType,
-};
-use crate::column_values::compact_space::CompactSpaceCompressor;
-
-/// The normalized header gives some parameters after applying the following
-/// normalization of the vector:
-/// `val -> (val - min_value) / gcd`
-///
-/// By design, after normalization, `min_value = 0` and `gcd = 1`.
-#[derive(Debug, Copy, Clone)]
-pub struct NormalizedHeader {
-    /// The number of values in the underlying column.
-    pub num_vals: u32,
-    /// The max value of the underlying column.
-    pub max_value: u64,
-}
-
-#[derive(Debug, Copy, Clone)]
-pub(crate) struct Header {
-    pub num_vals: u32,
-    pub min_value: u64,
-    pub max_value: u64,
-    pub gcd: Option<NonZeroU64>,
-    pub codec_type: FastFieldCodecType,
-}
-
-impl Header {
-    pub fn normalized(self) -> NormalizedHeader {
-        let gcd = self.gcd.map(|gcd| gcd.get()).unwrap_or(1);
-        let gcd_min_val_mapping =
-            StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd, self.min_value);
-
-        let max_value = gcd_min_val_mapping.mapping(self.max_value);
-        NormalizedHeader {
-            num_vals: self.num_vals,
-            max_value,
-        }
-    }
-
-    pub(crate) fn normalize_column<C: ColumnValues>(&self, from_column: C) -> impl ColumnValues {
-        normalize_column(from_column, self.min_value, self.gcd)
-    }
-
-    pub fn compute_header(
-        column: impl ColumnValues<u64>,
-        codecs: &[FastFieldCodecType],
-    ) -> Option<Header> {
-        let num_vals = column.num_vals();
-        let min_value = column.min_value();
-        let max_value = column.max_value();
-        let gcd = super::gcd::find_gcd(column.iter().map(|val| val - min_value))
-            .filter(|gcd| gcd.get() > 1u64);
-        let normalized_column = normalize_column(column, min_value, gcd);
-        let codec_type = detect_codec(normalized_column, codecs)?;
-        Some(Header {
-            num_vals,
-            min_value,
-            max_value,
-            gcd,
-            codec_type,
-        })
-    }
-}
-
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub(crate) struct U128Header {
-    pub num_vals: u32,
-    pub codec_type: U128FastFieldCodecType,
-}
-
-impl BinarySerializable for U128Header {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        VInt(self.num_vals as u64).serialize(writer)?;
-        self.codec_type.serialize(writer)?;
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let num_vals = VInt::deserialize(reader)?.0 as u32;
-        let codec_type = U128FastFieldCodecType::deserialize(reader)?;
-        Ok(U128Header {
-            num_vals,
-            codec_type,
-        })
-    }
-}
-
-fn normalize_column<C: ColumnValues>(
-    from_column: C,
-    min_value: u64,
-    gcd: Option<NonZeroU64>,
-) -> impl ColumnValues {
-    let gcd = gcd.map(|gcd| gcd.get()).unwrap_or(1);
-    let mapping = StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd, min_value);
-    monotonic_map_column(from_column, mapping)
-}
-
-impl BinarySerializable for Header {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        VInt(self.num_vals as u64).serialize(writer)?;
-        VInt(self.min_value).serialize(writer)?;
-        VInt(self.max_value - self.min_value).serialize(writer)?;
-        if let Some(gcd) = self.gcd {
-            VInt(gcd.get()).serialize(writer)?;
-        } else {
-            VInt(0u64).serialize(writer)?;
-        }
-        self.codec_type.serialize(writer)?;
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let num_vals = VInt::deserialize(reader)?.0 as u32;
-        let min_value = VInt::deserialize(reader)?.0;
-        let amplitude = VInt::deserialize(reader)?.0;
-        let max_value = min_value + amplitude;
-        let gcd_u64 = VInt::deserialize(reader)?.0;
-        let codec_type = FastFieldCodecType::deserialize(reader)?;
-        Ok(Header {
-            num_vals,
-            min_value,
-            max_value,
-            gcd: NonZeroU64::new(gcd_u64),
-            codec_type,
-        })
-    }
-}
-
-/// Serializes u128 values with the compact space codec.
-pub fn serialize_column_values_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
-    iter_gen: F,
-    num_vals: u32,
-    output: &mut impl io::Write,
-) -> io::Result<()> {
-    let header = U128Header {
-        num_vals,
-        codec_type: U128FastFieldCodecType::CompactSpace,
-    };
-    header.serialize(output)?;
-    let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
-    compressor.compress_into(iter_gen(), output)?;
-
-    Ok(())
-}
-
-/// Serializes the column with the codec with the best estimate on the data.
-pub fn serialize_column_values<T: MonotonicallyMappableToU64>(
-    typed_column: impl ColumnValues<T>,
-    codecs: &[FastFieldCodecType],
-    output: &mut impl io::Write,
-) -> io::Result<()> {
-    let column = monotonic_map_column(typed_column, StrictlyMonotonicMappingToInternal::<T>::new());
-    let header = Header::compute_header(&column, codecs).ok_or_else(|| {
-        io::Error::new(
-            io::ErrorKind::InvalidInput,
-            format!(
-                "Data cannot be serialized with this list of codec. {:?}",
-                codecs
-            ),
-        )
-    })?;
-    header.serialize(output)?;
-    let normalized_column = header.normalize_column(column);
-    assert_eq!(normalized_column.min_value(), 0u64);
-    serialize_given_codec(normalized_column, header.codec_type, output)?;
-    Ok(())
-}
-
-fn detect_codec(
-    column: impl ColumnValues<u64>,
-    codecs: &[FastFieldCodecType],
-) -> Option<FastFieldCodecType> {
-    let mut estimations = Vec::new();
-    for &codec in codecs {
-        let estimation_opt = match codec {
-            FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&column),
-            FastFieldCodecType::Linear => LinearCodec::estimate(&column),
-            FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&column),
-        };
-        if let Some(estimation) = estimation_opt {
-            estimations.push((estimation, codec));
-        }
-    }
-    if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) {
-        warn!(
-            "broken estimation for fast field codec {:?}",
-            broken_estimation.1
-        );
-    }
-    // removing nan values for codecs with broken calculations, and max values which disables
-    // codecs
-    estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
-    estimations.sort_by(|(score_left, _), (score_right, _)| score_left.total_cmp(score_right));
-    Some(estimations.first()?.1)
-}
-
-pub(crate) fn serialize_given_codec(
-    column: impl ColumnValues<u64>,
-    codec_type: FastFieldCodecType,
-    output: &mut impl io::Write,
-) -> io::Result<()> {
-    match codec_type {
-        FastFieldCodecType::Bitpacked => {
-            BitpackedCodec::serialize(&column, output)?;
-        }
-        FastFieldCodecType::Linear => {
-            LinearCodec::serialize(&column, output)?;
-        }
-        FastFieldCodecType::BlockwiseLinear => {
-            BlockwiseLinearCodec::serialize(&column, output)?;
-        }
-    }
-    Ok(())
-}
-
-#[cfg(test)]
-pub mod tests {
-    use std::sync::Arc;
-
-    use common::OwnedBytes;
-
-    use super::*;
-    use crate::column_values::{open_u64_mapped, VecColumn};
-
-    const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
-        FastFieldCodecType::Bitpacked,
-        FastFieldCodecType::Linear,
-        FastFieldCodecType::BlockwiseLinear,
-    ];
-
-    /// Helper function to serialize a column (autodetect from all codecs) and then open it
-    pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
-        column: &[T],
-    ) -> Arc<dyn ColumnValues<T>> {
-        let mut buffer = Vec::new();
-        serialize_column_values(&VecColumn::from(&column), &ALL_CODEC_TYPES, &mut buffer).unwrap();
-        open_u64_mapped(OwnedBytes::new(buffer)).unwrap()
-    }
-    #[test]
-    fn test_serialize_deserialize_u128_header() {
-        let original = U128Header {
-            num_vals: 11,
-            codec_type: U128FastFieldCodecType::CompactSpace,
-        };
-        let mut out = Vec::new();
-        original.serialize(&mut out).unwrap();
-        let restored = U128Header::deserialize(&mut &out[..]).unwrap();
-        assert_eq!(restored, original);
-    }
-
-    #[test]
-    fn test_serialize_deserialize() {
-        let original = [1u64, 5u64, 10u64];
-        let restored: Vec<u64> = serialize_and_load(&original[..]).iter().collect();
-        assert_eq!(&restored, &original[..]);
-    }
-
-    #[test]
-    fn test_fastfield_bool_size_bitwidth_1() {
-        let mut buffer = Vec::new();
-        let col = VecColumn::from(&[false, true][..]);
-        serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap();
-        // TODO put the header as a footer so that it serves as a padding.
-        // 5 bytes of header, 1 byte of value, 7 bytes of padding.
-        assert_eq!(buffer.len(), 5 + 1);
-    }
-
-    #[test]
-    fn test_fastfield_bool_bit_size_bitwidth_0() {
-        let mut buffer = Vec::new();
-        let col = VecColumn::from(&[true][..]);
-        serialize_column_values(&col, &ALL_CODEC_TYPES, &mut buffer).unwrap();
-        // 5 bytes of header, 0 bytes of value, 7 bytes of padding.
-        assert_eq!(buffer.len(), 5);
-    }
-
-    #[test]
-    fn test_fastfield_gcd() {
-        let mut buffer = Vec::new();
-        let vals: Vec<u64> = (0..80).map(|val| (val % 7) * 1_000u64).collect();
-        let col = VecColumn::from(&vals[..]);
-        serialize_column_values(&col, &[FastFieldCodecType::Bitpacked], &mut buffer).unwrap();
-        // Values are stored over 3 bits.
-        assert_eq!(buffer.len(), 7 + (3 * 80 / 8));
-    }
-}
--- a/columnar/src/column_values/stats.rs
+++ b/columnar/src/column_values/stats.rs
@@ -0,0 +1,103 @@
+use std::io;
+use std::io::Write;
+use std::num::NonZeroU64;
+
+use common::{BinarySerializable, VInt};
+
+use crate::RowId;
+
+/// Column statistics.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct ColumnStats {
+    /// GCD of the elements `el - min(column)`.
+    pub gcd: NonZeroU64,
+    /// Minimum value of the column.
+    pub min_value: u64,
+    /// Maximum value of the column.
+    pub max_value: u64,
+    /// Number of rows in the column.
+    pub num_rows: RowId,
+}
+
+impl ColumnStats {
+    /// Amplitude of the column values.
+    /// Difference between the maximum and the minimum value.
+    pub fn amplitude(&self) -> u64 {
+        self.max_value - self.min_value
+    }
+}
+
+impl BinarySerializable for ColumnStats {
+    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        VInt(self.min_value).serialize(writer)?;
+        VInt(self.gcd.get()).serialize(writer)?;
+        VInt(self.amplitude() / self.gcd).serialize(writer)?;
+        VInt(self.num_rows as u64).serialize(writer)?;
+        Ok(())
+    }
+
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        let min_value = VInt::deserialize(reader)?.0;
+        let gcd = VInt::deserialize(reader)?.0;
+        let gcd = NonZeroU64::new(gcd)
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "GCD of 0 is forbidden"))?;
+        let amplitude = VInt::deserialize(reader)?.0 * gcd.get();
+        let max_value = min_value + amplitude;
+        let num_rows = VInt::deserialize(reader)?.0 as RowId;
+        Ok(ColumnStats {
+            min_value,
+            max_value,
+            num_rows,
+            gcd,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::num::NonZeroU64;
+
+    use common::BinarySerializable;
+
+    use crate::column_values::ColumnStats;
+
+    #[track_caller]
+    fn test_stats_ser_deser_aux(stats: &ColumnStats, num_bytes: usize) {
+        let mut buffer: Vec<u8> = Vec::new();
+        stats.serialize(&mut buffer).unwrap();
+        assert_eq!(buffer.len(), num_bytes);
+        let deser_stats = ColumnStats::deserialize(&mut &buffer[..]).unwrap();
+        assert_eq!(stats, &deser_stats);
+    }
+
+    #[test]
+    fn test_stats_serialization() {
+        test_stats_ser_deser_aux(
+            &(ColumnStats {
+                gcd: NonZeroU64::new(3).unwrap(),
+                min_value: 1,
+                max_value: 3001,
+                num_rows: 10,
+            }),
+            5,
+        );
+        test_stats_ser_deser_aux(
+            &(ColumnStats {
+                gcd: NonZeroU64::new(1_000).unwrap(),
+                min_value: 1,
+                max_value: 3001,
+                num_rows: 10,
+            }),
+            5,
+        );
+        test_stats_ser_deser_aux(
+            &(ColumnStats {
+                gcd: NonZeroU64::new(1).unwrap(),
+                min_value: 0,
+                max_value: 0,
+                num_rows: 0,
+            }),
+            4,
+        );
+    }
+}
--- a/columnar/src/column_values/tests.rs
+++ b/columnar/src/column_values/tests.rs
@@ -1,309 +0,0 @@
-use proptest::prelude::*;
-use proptest::strategy::Strategy;
-use proptest::{prop_oneof, proptest};
-
-use super::bitpacked::BitpackedCodec;
-use super::blockwise_linear::BlockwiseLinearCodec;
-use super::linear::LinearCodec;
-use super::serialize::Header;
-
-pub(crate) fn create_and_validate<Codec: FastFieldCodec>(
-    data: &[u64],
-    name: &str,
-) -> Option<(f32, f32)> {
-    let col = &VecColumn::from(data);
-    let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
-    let normalized_col = header.normalize_column(col);
-    let estimation = Codec::estimate(&normalized_col)?;
-
-    let mut out = Vec::new();
-    let col = VecColumn::from(data);
-    serialize_column_values(&col, &[Codec::CODEC_TYPE], &mut out).unwrap();
-
-    let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
-
-    let reader = super::open_u64_mapped::<u64>(OwnedBytes::new(out)).unwrap();
-    assert_eq!(reader.num_vals(), data.len() as u32);
-    for (doc, orig_val) in data.iter().copied().enumerate() {
-        let val = reader.get_val(doc as u32);
-        assert_eq!(
-            val, orig_val,
-            "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data `{data:?}`",
-        );
-    }
-
-    if !data.is_empty() {
-        let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
-        let expected_positions: Vec<u32> = data
-            .iter()
-            .enumerate()
-            .filter(|(_, el)| **el == data[test_rand_idx])
-            .map(|(pos, _)| pos as u32)
-            .collect();
-        let mut positions = Vec::new();
-        reader.get_docids_for_value_range(
-            data[test_rand_idx]..=data[test_rand_idx],
-            0..data.len() as u32,
-            &mut positions,
-        );
-        assert_eq!(expected_positions, positions);
-    }
-    Some((estimation, actual_compression))
-}
-
-proptest! {
-    #![proptest_config(ProptestConfig::with_cases(100))]
-
-    #[test]
-    fn test_proptest_small_bitpacked(data in proptest::collection::vec(num_strategy(), 1..10)) {
-        create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
-    }
-
-    #[test]
-    fn test_proptest_small_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
-        create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
-    }
-
-    #[test]
-    fn test_proptest_small_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
-        create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
-    }
-}
-
-proptest! {
-    #![proptest_config(ProptestConfig::with_cases(10))]
-
-    #[test]
-    fn test_proptest_large_bitpacked(data in proptest::collection::vec(num_strategy(), 1..6000)) {
-        create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
-    }
-
-    #[test]
-    fn test_proptest_large_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
-        create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
-    }
-
-    #[test]
-    fn test_proptest_large_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
-        create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
-    }
-}
-
-fn num_strategy() -> impl Strategy<Value = u64> {
-    prop_oneof![
-        1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
-        1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
-        20 => prop::num::u64::ANY,
-    ]
-}
-
-pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
-    let mut data_and_names = vec![];
-
-    let data = (10..=10_000_u64).collect::<Vec<_>>();
-    data_and_names.push((data, "simple monotonically increasing"));
-
-    data_and_names.push((
-        vec![5, 6, 7, 8, 9, 10, 99, 100],
-        "offset in linear interpol",
-    ));
-    data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
-    data_and_names.push((vec![10], "single value"));
-
-    data_and_names.push((
-        vec![1572656989877777, 1170935903116329, 720575940379279, 0],
-        "overflow error",
-    ));
-
-    data_and_names
-}
-
-fn test_codec<C: FastFieldCodec>() {
-    let codec_name = format!("{:?}", C::CODEC_TYPE);
-    for (data, dataset_name) in get_codec_test_datasets() {
-        let estimate_actual_opt: Option<(f32, f32)> =
-            tests::create_and_validate::<C>(&data, dataset_name);
-        let result = if let Some((estimate, actual)) = estimate_actual_opt {
-            format!("Estimate `{estimate}` Actual `{actual}`")
-        } else {
-            "Disabled".to_string()
-        };
-        println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
-    }
-}
-#[test]
-fn test_codec_bitpacking() {
-    test_codec::<BitpackedCodec>();
-}
-#[test]
-fn test_codec_interpolation() {
-    test_codec::<LinearCodec>();
-}
-#[test]
-fn test_codec_multi_interpolation() {
-    test_codec::<BlockwiseLinearCodec>();
-}
-
-use super::*;
-
-#[test]
-fn estimation_good_interpolation_case() {
-    let data = (10..=20000_u64).collect::<Vec<_>>();
-    let data: VecColumn = data.as_slice().into();
-
-    let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
-    assert_le!(linear_interpol_estimation, 0.01);
-
-    let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
-    assert_le!(multi_linear_interpol_estimation, 0.2);
-    assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);
-
-    let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
-    assert_lt!(linear_interpol_estimation, bitpacked_estimation);
-}
-#[test]
-fn estimation_test_bad_interpolation_case() {
-    let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
-
-    let data: VecColumn = data.into();
-    let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
-    assert_le!(linear_interpol_estimation, 0.34);
-
-    let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
-    assert_lt!(bitpacked_estimation, linear_interpol_estimation);
-}
-
-#[test]
-fn estimation_prefer_bitpacked() {
-    let data = VecColumn::from(&[10, 10, 10, 10]);
-    let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
-    let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
-    assert_lt!(bitpacked_estimation, linear_interpol_estimation);
-}
-
-#[test]
-fn estimation_test_bad_interpolation_case_monotonically_increasing() {
-    let mut data: Vec<u64> = (201..=20000_u64).collect();
-    data.push(1_000_000);
-    let data: VecColumn = data.as_slice().into();
-
-    // in this case the linear interpolation can't in fact not be worse than bitpacking,
-    // but the estimator adds some threshold, which leads to estimated worse behavior
-    let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
-    assert_le!(linear_interpol_estimation, 0.35);
-
-    let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
-    assert_le!(bitpacked_estimation, 0.32);
-    assert_le!(bitpacked_estimation, linear_interpol_estimation);
-}
-
-#[test]
-fn test_fast_field_codec_type_to_code() {
-    let mut count_codec = 0;
-    for code in 0..=255 {
-        if let Some(codec_type) = FastFieldCodecType::from_code(code) {
-            assert_eq!(codec_type.to_code(), code);
-            count_codec += 1;
-        }
-    }
-    assert_eq!(count_codec, 3);
-}
-
-fn test_fastfield_gcd_i64_with_codec(
-    codec_type: FastFieldCodecType,
-    num_vals: usize,
-) -> io::Result<()> {
-    let mut vals: Vec<i64> = (-4..=(num_vals as i64) - 5).map(|val| val * 1000).collect();
-    let mut buffer: Vec<u8> = Vec::new();
-    crate::column_values::serialize_column_values(
-        &VecColumn::from(&vals),
-        &[codec_type],
-        &mut buffer,
-    )?;
-    let buffer = OwnedBytes::new(buffer);
-    let column = crate::column_values::open_u64_mapped::<i64>(buffer.clone())?;
-    assert_eq!(column.get_val(0), -4000i64);
-    assert_eq!(column.get_val(1), -3000i64);
-    assert_eq!(column.get_val(2), -2000i64);
-    assert_eq!(column.max_value(), (num_vals as i64 - 5) * 1000);
-    assert_eq!(column.min_value(), -4000i64);
-
-    // Can't apply gcd
-    let mut buffer_without_gcd = Vec::new();
-    vals.pop();
-    vals.push(1001i64);
-    crate::column_values::serialize_column_values(
-        &VecColumn::from(&vals),
-        &[codec_type],
-        &mut buffer_without_gcd,
-    )?;
-    let buffer_without_gcd = OwnedBytes::new(buffer_without_gcd);
-    assert!(buffer_without_gcd.len() > buffer.len());
-
-    Ok(())
-}
-
-#[test]
-fn test_fastfield_gcd_i64() -> io::Result<()> {
-    for &codec_type in &[
-        FastFieldCodecType::Bitpacked,
-        FastFieldCodecType::BlockwiseLinear,
-        FastFieldCodecType::Linear,
-    ] {
-        test_fastfield_gcd_i64_with_codec(codec_type, 5500)?;
-    }
-    Ok(())
-}
-
-fn test_fastfield_gcd_u64_with_codec(
-    codec_type: FastFieldCodecType,
-    num_vals: usize,
-) -> io::Result<()> {
-    let mut vals: Vec<u64> = (1..=num_vals).map(|i| i as u64 * 1000u64).collect();
-    let mut buffer: Vec<u8> = Vec::new();
-    crate::column_values::serialize_column_values(
-        &VecColumn::from(&vals),
-        &[codec_type],
-        &mut buffer,
-    )?;
-    let buffer = OwnedBytes::new(buffer);
-    let column = crate::column_values::open_u64_mapped::<u64>(buffer.clone())?;
-    assert_eq!(column.get_val(0), 1000u64);
-    assert_eq!(column.get_val(1), 2000u64);
-    assert_eq!(column.get_val(2), 3000u64);
-    assert_eq!(column.max_value(), num_vals as u64 * 1000);
-    assert_eq!(column.min_value(), 1000u64);
-
-    // Can't apply gcd
-    let mut buffer_without_gcd = Vec::new();
-    vals.pop();
-    vals.push(1001u64);
-    crate::column_values::serialize_column_values(
-        &VecColumn::from(&vals),
-        &[codec_type],
-        &mut buffer_without_gcd,
-    )?;
-    let buffer_without_gcd = OwnedBytes::new(buffer_without_gcd);
-    assert!(buffer_without_gcd.len() > buffer.len());
-    Ok(())
-}
-
-#[test]
-fn test_fastfield_gcd_u64() -> io::Result<()> {
-    for &codec_type in &[
-        FastFieldCodecType::Bitpacked,
-        FastFieldCodecType::BlockwiseLinear,
-        FastFieldCodecType::Linear,
-    ] {
-        test_fastfield_gcd_u64_with_codec(codec_type, 5500)?;
-    }
-    Ok(())
-}
-
-#[test]
-pub fn test_fastfield2() {
-    let test_fastfield = crate::column_values::serialize_and_load(&[100u64, 200u64, 300u64]);
-    assert_eq!(test_fastfield.get_val(0), 100);
-    assert_eq!(test_fastfield.get_val(1), 200);
-    assert_eq!(test_fastfield.get_val(2), 300);
-}
--- a/columnar/src/column_values/u128_based/compact_space/blank_range.rs
+++ b/columnar/src/column_values/u128_based/compact_space/blank_range.rs
@@ -38,6 +38,6 @@ impl Ord for BlankRange {
 }
 impl PartialOrd for BlankRange {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.blank_size().cmp(&other.blank_size()))
+        Some(self.cmp(other))
    }
 }
--- a/columnar/src/column_values/u128_based/compact_space/build_compact_space.rs
+++ b/columnar/src/column_values/u128_based/compact_space/build_compact_space.rs
@@ -10,7 +10,7 @@ use super::{CompactSpace, RangeMapping};
 /// Put the blanks for the sorted values into a binary heap
 fn get_blanks(values_sorted: &BTreeSet<u128>) -> BinaryHeap<BlankRange> {
    let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
-    for (first, second) in values_sorted.iter().tuple_windows() {
+    for (first, second) in values_sorted.iter().copied().tuple_windows() {
        // Correctness Overflow: the values are deduped and sorted (BTreeSet property), that means
        // there's always space between two values.
        let blank_range = first + 1..=second - 1;
@@ -65,12 +65,12 @@ pub fn get_compact_space(
        return compact_space_builder.finish();
    }

-    let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);
-    // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
-
    // We start by space that's limited to min_value..=max_value
-    let min_value = *values_deduped_sorted.iter().next().unwrap_or(&0);
-    let max_value = *values_deduped_sorted.iter().last().unwrap_or(&0);
+    // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
+    let min_value = values_deduped_sorted.iter().next().copied().unwrap_or(0);
+    let max_value = values_deduped_sorted.iter().last().copied().unwrap_or(0);
+
+    let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);

    // +1 for null, in case min and max covers the whole space, we are off by one.
    let mut amplitude_compact_space = (max_value - min_value).saturating_add(1);
@@ -84,6 +84,7 @@ pub fn get_compact_space(
    let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);

    let mut blank_collector = BlankCollector::new();
+
    // We will stage blanks until they reduce the compact space by at least 1 bit and then flush
    // them if the metadata cost is lower than the total number of saved bits.
    // Binary heap to process the gaps by their size
@@ -93,6 +94,7 @@ pub fn get_compact_space(
        let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
        let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum;
        let amplitude_new_bits = num_bits(amplitude_new_compact_space);
+
        if amplitude_bits == amplitude_new_bits {
            continue;
        }
@@ -100,7 +102,16 @@ pub fn get_compact_space(
        // TODO: Maybe calculate exact cost of blanks and run this more expensive computation only,
        // when amplitude_new_bits changes
        let cost = blank_collector.num_staged_blanks() * cost_per_blank;
-        if cost >= saved_bits {
+
+        // We want to end up with a compact space that fits into 32 bits.
+        // In order to deal with pathological cases, we force the algorithm to keep
+        // refining the compact space until the amplitude bits are at most 32.
+        //
+        // The worst case scenario happens for a large number of u128s regularly
+        // spread over the full u128 space.
+        //
+        // This change will force the algorithm to degenerate into dictionary encoding.
+        if amplitude_bits <= 32 && cost >= saved_bits {
            // Continue here, since although we walk over the blanks by size,
            // we can potentially save a lot at the last bits, which are smaller blanks
            //
@@ -115,6 +126,8 @@ pub fn get_compact_space(
        compact_space_builder.add_blanks(blank_collector.drain().map(|blank| blank.blank_range()));
    }

+    assert!(amplitude_bits <= 32);
+
    // special case, when we don't collected any blanks because:
    // * the data is empty (early exit)
    // * the algorithm did decide it's not worth the cost, which can be the case for single values
@@ -199,7 +212,7 @@ impl CompactSpaceBuilder {
            covered_space.push(0..=0); // empty data case
        };

-        let mut compact_start: u64 = 1; // 0 is reserved for `null`
+        let mut compact_start: u32 = 1; // 0 is reserved for `null`
        let mut ranges_mapping: Vec<RangeMapping> = Vec::with_capacity(covered_space.len());
        for cov in covered_space {
            let range_mapping = super::RangeMapping {
@@ -218,6 +231,7 @@ impl CompactSpaceBuilder {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::column_values::u128_based::compact_space::COST_PER_BLANK_IN_BITS;

    #[test]
    fn test_binary_heap_pop_order() {
@@ -228,4 +242,11 @@ mod tests {
        assert_eq!(blanks.pop().unwrap().blank_size(), 101);
        assert_eq!(blanks.pop().unwrap().blank_size(), 11);
    }
+
+    #[test]
+    fn test_worst_case_scenario() {
+        let vals: BTreeSet<u128> = (0..8).map(|i| i * ((1u128 << 34) / 8)).collect();
+        let compact_space = get_compact_space(&vals, vals.len() as u32, COST_PER_BLANK_IN_BITS);
+        assert!(compact_space.amplitude_compact_space() < u32::MAX as u128);
+    }
 }
--- a/columnar/src/column_values/u128_based/compact_space/mod.rs
+++ b/columnar/src/column_values/u128_based/compact_space/mod.rs
@@ -17,14 +17,15 @@ use std::{
    ops::{Range, RangeInclusive},
 };

+mod blank_range;
+mod build_compact_space;
+
+use build_compact_space::get_compact_space;
 use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
 use tantivy_bitpacker::{self, BitPacker, BitUnpacker};

-use crate::compact_space::build_compact_space::get_compact_space;
-use crate::Column;
-
-mod blank_range;
-mod build_compact_space;
+use crate::column_values::ColumnValues;
+use crate::RowId;

 /// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
 /// blanks depends on the number of blanks.
@@ -41,21 +42,21 @@ pub struct CompactSpace {
 #[derive(Debug, Clone, Eq, PartialEq)]
 struct RangeMapping {
    value_range: RangeInclusive<u128>,
-    compact_start: u64,
+    compact_start: u32,
 }
 impl RangeMapping {
-    fn range_length(&self) -> u64 {
-        (self.value_range.end() - self.value_range.start()) as u64 + 1
+    fn range_length(&self) -> u32 {
+        (self.value_range.end() - self.value_range.start()) as u32 + 1
    }

    // The last value of the compact space in this range
-    fn compact_end(&self) -> u64 {
+    fn compact_end(&self) -> u32 {
        self.compact_start + self.range_length() - 1
    }
 }

 impl BinarySerializable for CompactSpace {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
+    fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        VInt(self.ranges_mapping.len() as u64).serialize(writer)?;

        let mut prev_value = 0;
@@ -80,7 +81,7 @@ impl BinarySerializable for CompactSpace {
        let num_ranges = VInt::deserialize(reader)?.0;
        let mut ranges_mapping: Vec<RangeMapping> = vec![];
        let mut value = 0u128;
-        let mut compact_start = 1u64; // 0 is reserved for `null`
+        let mut compact_start = 1u32; // 0 is reserved for `null`
        for _ in 0..num_ranges {
            let blank_delta_start = VIntU128::deserialize(reader)?.0;
            value += blank_delta_start;
@@ -121,10 +122,10 @@ impl CompactSpace {

    /// Returns either Ok(the value in the compact space) or if it is outside the compact space the
    /// Err(position where it would be inserted)
-    fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
+    fn u128_to_compact(&self, value: u128) -> Result<u32, usize> {
        self.ranges_mapping
            .binary_search_by(|probe| {
-                let value_range = &probe.value_range;
+                let value_range: &RangeInclusive<u128> = &probe.value_range;
                if value < *value_range.start() {
                    Ordering::Greater
                } else if value > *value_range.end() {
@@ -135,13 +136,13 @@ impl CompactSpace {
            })
            .map(|pos| {
                let range_mapping = &self.ranges_mapping[pos];
-                let pos_in_range = (value - range_mapping.value_range.start()) as u64;
+                let pos_in_range: u32 = (value - range_mapping.value_range.start()) as u32;
                range_mapping.compact_start + pos_in_range
            })
    }

-    /// Unpacks a value from compact space u64 to u128 space
-    fn compact_to_u128(&self, compact: u64) -> u128 {
+    /// Unpacks a value from compact space u32 to u128 space
+    fn compact_to_u128(&self, compact: u32) -> u128 {
        let pos = self
            .ranges_mapping
            .binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start)
@@ -158,22 +159,33 @@ impl CompactSpace {
 pub struct CompactSpaceCompressor {
    params: IPCodecParams,
 }
+
 #[derive(Debug, Clone)]
 pub struct IPCodecParams {
    compact_space: CompactSpace,
    bit_unpacker: BitUnpacker,
    min_value: u128,
    max_value: u128,
-    num_vals: u32,
+    num_vals: RowId,
    num_bits: u8,
 }

 impl CompactSpaceCompressor {
+    pub fn num_vals(&self) -> RowId {
+        self.params.num_vals
+    }
+
    /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
-    pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u32) -> Self {
+    pub fn train_from(iter: impl Iterator<Item = u128>) -> Self {
        let mut values_sorted = BTreeSet::new();
-        values_sorted.extend(iter);
-        let total_num_values = num_vals;
+        // Total number of values, with their redundancy.
+        let mut total_num_values = 0u32;
+        for val in iter {
+            total_num_values += 1u32;
+            values_sorted.insert(val);
+        }
+        let min_value = *values_sorted.iter().next().unwrap_or(&0);
+        let max_value = *values_sorted.iter().last().unwrap_or(&0);

        let compact_space =
            get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
@@ -185,13 +197,12 @@ impl CompactSpaceCompressor {
        );

        let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64);
-        let min_value = *values_sorted.iter().next().unwrap_or(&0);
-        let max_value = *values_sorted.iter().last().unwrap_or(&0);
+
        assert_eq!(
            compact_space
                .u128_to_compact(max_value)
                .expect("could not convert max value to compact space"),
-            amplitude_compact_space as u64
+            amplitude_compact_space as u32
        );
        CompactSpaceCompressor {
            params: IPCodecParams {
@@ -232,7 +243,7 @@ impl CompactSpaceCompressor {
                        "Could not convert value to compact_space. This is a bug.",
                    )
                })?;
-            bitpacker.write(compact, self.params.num_bits, write)?;
+            bitpacker.write(compact as u64, self.params.num_bits, write)?;
        }
        bitpacker.close(write)?;
        self.write_footer(write)?;
@@ -247,7 +258,7 @@ pub struct CompactSpaceDecompressor {
 }

 impl BinarySerializable for IPCodecParams {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
+    fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        // header flags for future optional dictionary encoding
        let footer_flags = 0u64;
        footer_flags.serialize(writer)?;
@@ -281,7 +292,7 @@ impl BinarySerializable for IPCodecParams {
    }
 }

-impl Column<u128> for CompactSpaceDecompressor {
+impl ColumnValues<u128> for CompactSpaceDecompressor {
    #[inline]
    fn get_val(&self, doc: u32) -> u128 {
        self.get(doc)
@@ -305,49 +316,7 @@ impl Column<u128> for CompactSpaceDecompressor {
    }

    #[inline]
-    fn get_docids_for_value_range(
-        &self,
-        value_range: RangeInclusive<u128>,
-        positions_range: Range<u32>,
-        positions: &mut Vec<u32>,
-    ) {
-        self.get_positions_for_value_range(value_range, positions_range, positions)
-    }
-}
-
-impl CompactSpaceDecompressor {
-    pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
-        let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
-        let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;
-
-        let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
-        let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
-        let decompressor = CompactSpaceDecompressor { data, params };
-
-        Ok(decompressor)
-    }
-
-    /// Converting to compact space for the decompressor is more complex, since we may get values
-    /// which are outside the compact space. e.g. if we map
-    /// 1000 => 5
-    /// 2000 => 6
-    ///
-    /// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
-    /// error with the index of the next range.
-    fn u128_to_compact(&self, value: u128) -> Result<u64, usize> {
-        self.params.compact_space.u128_to_compact(value)
-    }
-
-    fn compact_to_u128(&self, compact: u64) -> u128 {
-        self.params.compact_space.compact_to_u128(compact)
-    }
-
-    /// Comparing on compact space: Random dataset 0,24 (50% random hit) - 1.05 GElements/s
-    /// Comparing on compact space: Real dataset 1.08 GElements/s
-    ///
-    /// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
-    #[inline]
-    pub fn get_positions_for_value_range(
+    fn get_row_ids_for_value_range(
        &self,
        value_range: RangeInclusive<u128>,
        position_range: Range<u32>,
@@ -387,44 +356,42 @@ impl CompactSpaceDecompressor {
            range_mapping.compact_end()
        });

-        let range = compact_from..=compact_to;
+        let value_range = compact_from..=compact_to;
+        self.get_positions_for_compact_value_range(value_range, position_range, positions);
+    }
+}

-        let scan_num_docs = position_range.end - position_range.start;
+impl CompactSpaceDecompressor {
+    pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
+        let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
+        let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;

-        let step_size = 4;
-        let cutoff = position_range.start + scan_num_docs - scan_num_docs % step_size;
+        let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
+        let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
+        let decompressor = CompactSpaceDecompressor { data, params };

-        let mut push_if_in_range = |idx, val| {
-            if range.contains(&val) {
-                positions.push(idx);
-            }
-        };
-        let get_val = |idx| self.params.bit_unpacker.get(idx, &self.data);
-        // unrolled loop
-        for idx in (position_range.start..cutoff).step_by(step_size as usize) {
-            let idx1 = idx;
-            let idx2 = idx + 1;
-            let idx3 = idx + 2;
-            let idx4 = idx + 3;
-            let val1 = get_val(idx1);
-            let val2 = get_val(idx2);
-            let val3 = get_val(idx3);
-            let val4 = get_val(idx4);
-            push_if_in_range(idx1, val1);
-            push_if_in_range(idx2, val2);
-            push_if_in_range(idx3, val3);
-            push_if_in_range(idx4, val4);
-        }
+        Ok(decompressor)
+    }

-        // handle rest
-        for idx in cutoff..position_range.end {
-            push_if_in_range(idx, get_val(idx));
-        }
+    /// Converting to compact space for the decompressor is more complex, since we may get values
+    /// which are outside the compact space. e.g. if we map
+    /// 1000 => 5
+    /// 2000 => 6
+    ///
+    /// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
+    /// error with the index of the next range.
+    fn u128_to_compact(&self, value: u128) -> Result<u32, usize> {
+        self.params.compact_space.u128_to_compact(value)
+    }
+
+    fn compact_to_u128(&self, compact: u32) -> u128 {
+        self.params.compact_space.compact_to_u128(compact)
    }

    #[inline]
-    fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
-        (0..self.params.num_vals).map(move |idx| self.params.bit_unpacker.get(idx, &self.data))
+    fn iter_compact(&self) -> impl Iterator<Item = u32> + '_ {
+        (0..self.params.num_vals)
+            .map(move |idx| self.params.bit_unpacker.get(idx, &self.data) as u32)
    }

    #[inline]
@@ -437,7 +404,7 @@ impl CompactSpaceDecompressor {

    #[inline]
    pub fn get(&self, idx: u32) -> u128 {
-        let compact = self.params.bit_unpacker.get(idx, &self.data);
+        let compact = self.params.bit_unpacker.get(idx, &self.data) as u32;
        self.compact_to_u128(compact)
    }

@@ -448,27 +415,39 @@ impl CompactSpaceDecompressor {
    pub fn max_value(&self) -> u128 {
        self.params.max_value
    }
+
+    fn get_positions_for_compact_value_range(
+        &self,
+        value_range: RangeInclusive<u32>,
+        position_range: Range<u32>,
+        positions: &mut Vec<u32>,
+    ) {
+        self.params.bit_unpacker.get_ids_for_value_range(
+            *value_range.start() as u64..=*value_range.end() as u64,
+            position_range,
+            &self.data,
+            positions,
+        );
+    }
 }

 #[cfg(test)]
 mod tests {

-    use std::fmt;
+    use itertools::Itertools;

    use super::*;
-    use crate::format_version::read_format_version;
-    use crate::null_index_footer::read_null_index_footer;
-    use crate::serialize::U128Header;
-    use crate::{open_u128, serialize_u128};
+    use crate::column_values::u128_based::U128Header;
+    use crate::column_values::{open_u128_mapped, serialize_column_values_u128};

    #[test]
    fn compact_space_test() {
-        let ips = &[
+        let ips: BTreeSet<u128> = [
            2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
        ]
        .into_iter()
        .collect();
-        let compact_space = get_compact_space(ips, ips.len() as u32, 11);
+        let compact_space = get_compact_space(&ips, ips.len() as u32, 11);
        let amplitude = compact_space.amplitude_compact_space();
        assert_eq!(amplitude, 17);
        assert_eq!(1, compact_space.u128_to_compact(2).unwrap());
@@ -491,8 +470,8 @@ mod tests {
        );

        for ip in ips {
-            let compact = compact_space.u128_to_compact(*ip).unwrap();
-            assert_eq!(compact_space.compact_to_u128(compact), *ip);
+            let compact = compact_space.u128_to_compact(ip).unwrap();
+            assert_eq!(compact_space.compact_to_u128(compact), ip);
        }
    }

@@ -518,7 +497,7 @@ mod tests {
                    .map(|pos| pos as u32)
                    .collect::<Vec<_>>();
                let mut positions = Vec::new();
-                decompressor.get_positions_for_value_range(
+                decompressor.get_row_ids_for_value_range(
                    range,
                    0..decompressor.num_vals(),
                    &mut positions,
@@ -535,18 +514,9 @@ mod tests {

    fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
        let mut out = Vec::new();
-        serialize_u128(
-            || u128_vals.iter().cloned(),
-            u128_vals.len() as u32,
-            &mut out,
-        )
-        .unwrap();
-
+        serialize_column_values_u128(&u128_vals, &mut out).unwrap();
        let data = OwnedBytes::new(out);
-        let (data, _format_version) = read_format_version(data).unwrap();
-        let (data, _null_index_footer) = read_null_index_footer(data).unwrap();
        test_all(data.clone(), u128_vals);
-
        data
    }

@@ -572,13 +542,13 @@ mod tests {
            let val = *val;
            let pos = pos as u32;
            let mut positions = Vec::new();
-            decomp.get_positions_for_value_range(val..=val, pos..pos + 1, &mut positions);
+            decomp.get_row_ids_for_value_range(val..=val, pos..pos + 1, &mut positions);
            assert_eq!(positions, vec![pos]);
        }

        // handle docid range out of bounds
-        let positions = get_positions_for_value_range_helper(&decomp, 0..=1, 1..u32::MAX);
-        assert_eq!(positions, vec![]);
+        let positions: Vec<u32> = get_positions_for_value_range_helper(&decomp, 0..=1, 1..u32::MAX);
+        assert!(positions.is_empty());

        let positions =
            get_positions_for_value_range_helper(&decomp, 0..=1, complete_range.clone());
@@ -614,61 +584,59 @@ mod tests {
            vec![3, 4]
        );
        assert_eq!(
-            get_positions_for_value_range_helper(
+            &get_positions_for_value_range_helper(
                &decomp,
                99998u128..=99999u128,
                complete_range.clone()
            ),
-            vec![3]
+            &[3]
        );
+        assert!(get_positions_for_value_range_helper(
+            &decomp,
+            99998u128..=99998u128,
+            complete_range.clone()
+        )
+        .is_empty());
        assert_eq!(
-            get_positions_for_value_range_helper(
-                &decomp,
-                99998u128..=99998u128,
-                complete_range.clone()
-            ),
-            vec![]
-        );
-        assert_eq!(
-            get_positions_for_value_range_helper(
+            &get_positions_for_value_range_helper(
                &decomp,
                333u128..=333u128,
                complete_range.clone()
            ),
-            vec![8]
+            &[8]
        );
        assert_eq!(
-            get_positions_for_value_range_helper(
+            &get_positions_for_value_range_helper(
                &decomp,
                332u128..=333u128,
                complete_range.clone()
            ),
-            vec![8]
+            &[8]
        );
        assert_eq!(
-            get_positions_for_value_range_helper(
+            &get_positions_for_value_range_helper(
                &decomp,
                332u128..=334u128,
                complete_range.clone()
            ),
-            vec![8]
+            &[8]
        );
        assert_eq!(
-            get_positions_for_value_range_helper(
+            &get_positions_for_value_range_helper(
                &decomp,
                333u128..=334u128,
                complete_range.clone()
            ),
-            vec![8]
+            &[8]
        );

        assert_eq!(
-            get_positions_for_value_range_helper(
+            &get_positions_for_value_range_helper(
                &decomp,
                4_000_211_221u128..=5_000_000_000u128,
                complete_range
            ),
-            vec![6, 7]
+            &[6, 7]
        );
    }

@@ -694,27 +662,27 @@ mod tests {
        let _header = U128Header::deserialize(&mut data);
        let decomp = CompactSpaceDecompressor::open(data).unwrap();
        let complete_range = 0..vals.len() as u32;
-        assert_eq!(
-            get_positions_for_value_range_helper(&decomp, 0..=5, complete_range.clone()),
-            vec![]
+        assert!(
+            &get_positions_for_value_range_helper(&decomp, 0..=5, complete_range.clone())
+                .is_empty(),
        );
        assert_eq!(
-            get_positions_for_value_range_helper(&decomp, 0..=100, complete_range.clone()),
-            vec![0]
+            &get_positions_for_value_range_helper(&decomp, 0..=100, complete_range.clone()),
+            &[0]
        );
        assert_eq!(
-            get_positions_for_value_range_helper(&decomp, 0..=105, complete_range),
-            vec![0]
+            &get_positions_for_value_range_helper(&decomp, 0..=105, complete_range),
+            &[0]
        );
    }

-    fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd + fmt::Debug>(
+    fn get_positions_for_value_range_helper<C: ColumnValues<T> + ?Sized, T: PartialOrd>(
        column: &C,
        value_range: RangeInclusive<T>,
        doc_id_range: Range<u32>,
    ) -> Vec<u32> {
        let mut positions = Vec::new();
-        column.get_docids_for_value_range(value_range, doc_id_range, &mut positions);
+        column.get_row_ids_for_value_range(value_range, doc_id_range, &mut positions);
        positions
    }

@@ -736,8 +704,8 @@ mod tests {
            5_000_000_000,
        ];
        let mut out = Vec::new();
-        serialize_u128(|| vals.iter().cloned(), vals.len() as u32, &mut out).unwrap();
-        let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
+        serialize_column_values_u128(&&vals[..], &mut out).unwrap();
+        let decomp = open_u128_mapped(OwnedBytes::new(out)).unwrap();
        let complete_range = 0..vals.len() as u32;

        assert_eq!(
@@ -790,7 +758,7 @@ mod tests {
        let vals = &[1_000_000_000u128; 100];
        let _data = test_aux_vals(vals);
    }
-    use itertools::Itertools;
+
    use proptest::prelude::*;

    fn num_strategy() -> impl Strategy<Value = u128> {
@@ -806,10 +774,9 @@ mod tests {
    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10))]

-            #[test]
-            fn compress_decompress_random(vals in proptest::collection::vec(num_strategy()
-    , 1..1000)) {
-                let _data = test_aux_vals(&vals);
-            }
+        #[test]
+        fn compress_decompress_random(vals in proptest::collection::vec(num_strategy() , 1..1000)) {
+            let _data = test_aux_vals(&vals);
        }
+    }
 }
--- a/columnar/src/column_values/u128_based/mod.rs
+++ b/columnar/src/column_values/u128_based/mod.rs
@@ -0,0 +1,178 @@
+use std::fmt::Debug;
+use std::io;
+use std::io::Write;
+use std::sync::Arc;
+
+mod compact_space;
+
+use common::{BinarySerializable, OwnedBytes, VInt};
+use compact_space::{CompactSpaceCompressor, CompactSpaceDecompressor};
+
+use crate::column_values::monotonic_map_column;
+use crate::column_values::monotonic_mapping::{
+    StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
+};
+use crate::iterable::Iterable;
+use crate::{ColumnValues, MonotonicallyMappableToU128};
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub(crate) struct U128Header {
+    pub num_vals: u32,
+    pub codec_type: U128FastFieldCodecType,
+}
+
+impl BinarySerializable for U128Header {
+    fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        VInt(self.num_vals as u64).serialize(writer)?;
+        self.codec_type.serialize(writer)?;
+        Ok(())
+    }
+
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        let num_vals = VInt::deserialize(reader)?.0 as u32;
+        let codec_type = U128FastFieldCodecType::deserialize(reader)?;
+        Ok(U128Header {
+            num_vals,
+            codec_type,
+        })
+    }
+}
+
+/// Serializes u128 values with the compact space codec.
+pub fn serialize_column_values_u128<T: MonotonicallyMappableToU128>(
+    iterable: &dyn Iterable<T>,
+    output: &mut impl io::Write,
+) -> io::Result<()> {
+    let compressor = CompactSpaceCompressor::train_from(
+        iterable
+            .boxed_iter()
+            .map(MonotonicallyMappableToU128::to_u128),
+    );
+    let header = U128Header {
+        num_vals: compressor.num_vals(),
+        codec_type: U128FastFieldCodecType::CompactSpace,
+    };
+    header.serialize(output)?;
+    compressor.compress_into(
+        iterable
+            .boxed_iter()
+            .map(MonotonicallyMappableToU128::to_u128),
+        output,
+    )?;
+    Ok(())
+}
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
+#[repr(u8)]
+/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data.
+pub(crate) enum U128FastFieldCodecType {
+    /// This codec takes a large number space (u128) and reduces it to a compact number space, by
+    /// removing the holes.
+    CompactSpace = 1,
+}
+
+impl BinarySerializable for U128FastFieldCodecType {
+    fn serialize<W: Write + ?Sized>(&self, wrt: &mut W) -> io::Result<()> {
+        self.to_code().serialize(wrt)
+    }
+
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        let code = u8::deserialize(reader)?;
+        let codec_type: Self = Self::from_code(code)
+            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
+        Ok(codec_type)
+    }
+}
+
+impl U128FastFieldCodecType {
+    pub(crate) fn to_code(self) -> u8 {
+        self as u8
+    }
+
+    pub(crate) fn from_code(code: u8) -> Option<Self> {
+        match code {
+            1 => Some(Self::CompactSpace),
+            _ => None,
+        }
+    }
+}
+
+/// Returns the correct codec reader wrapped in the `Arc` for the data.
+pub fn open_u128_mapped<T: MonotonicallyMappableToU128 + Debug>(
+    mut bytes: OwnedBytes,
+) -> io::Result<Arc<dyn ColumnValues<T>>> {
+    let header = U128Header::deserialize(&mut bytes)?;
+    assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
+    let reader = CompactSpaceDecompressor::open(bytes)?;
+    let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<T>> =
+        StrictlyMonotonicMappingToInternal::<T>::new().into();
+    Ok(Arc::new(monotonic_map_column(reader, inverted)))
+}
+#[cfg(test)]
+pub mod tests {
+    use super::*;
+    use crate::column_values::u64_based::{
+        serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
+        ALL_U64_CODEC_TYPES,
+    };
+    use crate::column_values::CodecType;
+
+    #[test]
+    fn test_serialize_deserialize_u128_header() {
+        let original = U128Header {
+            num_vals: 11,
+            codec_type: U128FastFieldCodecType::CompactSpace,
+        };
+        let mut out = Vec::new();
+        original.serialize(&mut out).unwrap();
+        let restored = U128Header::deserialize(&mut &out[..]).unwrap();
+        assert_eq!(restored, original);
+    }
+
+    #[test]
+    fn test_serialize_deserialize() {
+        let original = [1u64, 5u64, 10u64];
+        let restored: Vec<u64> =
+            serialize_and_load_u64_based_column_values(&&original[..], &ALL_U64_CODEC_TYPES)
+                .iter()
+                .collect();
+        assert_eq!(&restored, &original[..]);
+    }
+
+    #[test]
+    fn test_fastfield_bool_size_bitwidth_1() {
+        let mut buffer = Vec::new();
+        serialize_u64_based_column_values::<bool>(
+            &&[false, true][..],
+            &ALL_U64_CODEC_TYPES,
+            &mut buffer,
+        )
+        .unwrap();
+        // TODO put the header as a footer so that it serves as a padding.
+        // 5 bytes of header, 1 byte of value, 7 bytes of padding.
+        assert_eq!(buffer.len(), 5 + 1);
+    }
+
+    #[test]
+    fn test_fastfield_bool_bit_size_bitwidth_0() {
+        let mut buffer = Vec::new();
+        serialize_u64_based_column_values::<bool>(
+            &&[false, true][..],
+            &ALL_U64_CODEC_TYPES,
+            &mut buffer,
+        )
+        .unwrap();
+        // 6 bytes of header, 0 bytes of value, 7 bytes of padding.
+        assert_eq!(buffer.len(), 6);
+    }
+
+    #[test]
+    fn test_fastfield_gcd() {
+        let mut buffer = Vec::new();
+        let vals: Vec<u64> = (0..80).map(|val| (val % 7) * 1_000u64).collect();
+        serialize_u64_based_column_values(&&vals[..], &[CodecType::Bitpacked], &mut buffer)
+            .unwrap();
+        // Values are stored over 3 bits.
+        assert_eq!(buffer.len(), 6 + (3 * 80 / 8));
+    }
+}
--- a/columnar/src/column_values/u64_based/bitpacked.rs
+++ b/columnar/src/column_values/u64_based/bitpacked.rs
@@ -0,0 +1,189 @@
+use std::io::{self, Write};
+use std::num::NonZeroU64;
+use std::ops::{Range, RangeInclusive};
+
+use common::{BinarySerializable, OwnedBytes};
+use fastdivide::DividerU64;
+use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
+
+use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
+use crate::{ColumnValues, RowId};
+
+/// Reader for a column serialized with the bitpacked codec.
+///
+/// A value is decoded as `stats.min_value + stats.gcd * bitpacked` (see `get_val`).
+#[derive(Clone)]
+pub struct BitpackedReader {
+    // What remains of the input bytes once the stats have been deserialized from their head.
+    data: OwnedBytes,
+    // Unpacker built with the bit width derived from the stats.
+    bit_unpacker: BitUnpacker,
+    // Column statistics, deserialized from the head of the input bytes.
+    stats: ColumnStats,
+}
+
+/// Returns the quotient of `n` by `q`, rounded up.
+#[inline(always)]
+const fn div_ceil(n: u64, q: NonZeroU64) -> u64 {
+    // copied from unstable rust standard library.
+    let d = n / q.get();
+    let r = n % q.get();
+    // A non-zero remainder means that the exact quotient lies strictly above `d`.
+    if r > 0 {
+        d + 1
+    } else {
+        d
+    }
+}
+
+// The bitpacked codec applies a linear transformation `f` over data that are bitpacked.
+// f is defined by:
+// f: bitpacked -> stats.min_value + stats.gcd * bitpacked
+//
+// In order to run range queries, we invert the transformation.
+// `transform_range_before_linear_transformation` returns the range of values
+// [min_bitpacked_value..=max_bitpacked_value] such that
+// f(bitpacked) ∈ [min_value, max_value] <=> bitpacked ∈ [min_bitpacked_value, max_bitpacked_value]
+//
+// `None` is returned when `range` is empty or does not overlap
+// `[stats.min_value, stats.max_value]`.
+fn transform_range_before_linear_transformation(
+    stats: &ColumnStats,
+    range: RangeInclusive<u64>,
+) -> Option<RangeInclusive<u64>> {
+    if range.is_empty() {
+        return None;
+    }
+    // The requested range lies entirely below the values of the column.
+    if stats.min_value > *range.end() {
+        return None;
+    }
+    // The requested range lies entirely above the values of the column.
+    if stats.max_value < *range.start() {
+        return None;
+    }
+    // Undo the `+ stats.min_value` part of `f`. A bound below `min_value` saturates to 0.
+    let shifted_range =
+        range.start().saturating_sub(stats.min_value)..=range.end().saturating_sub(stats.min_value);
+    // Undo the `* stats.gcd` part of `f`: the start is rounded up and the end is rounded down.
+    let start_before_gcd_multiplication: u64 = div_ceil(*shifted_range.start(), stats.gcd);
+    let end_before_gcd_multiplication: u64 = *shifted_range.end() / stats.gcd;
+    Some(start_before_gcd_multiplication..=end_before_gcd_multiplication)
+}
+
+impl ColumnValues for BitpackedReader {
+    // Decodes the value of row `doc` as `min_value + gcd * bitpacked`.
+    #[inline(always)]
+    fn get_val(&self, doc: u32) -> u64 {
+        self.stats.min_value + self.stats.gcd.get() * self.bit_unpacker.get(doc, &self.data)
+    }
+
+    // Minimum value, as recorded in the column stats.
+    #[inline]
+    fn min_value(&self) -> u64 {
+        self.stats.min_value
+    }
+    // Maximum value, as recorded in the column stats.
+    #[inline]
+    fn max_value(&self) -> u64 {
+        self.stats.max_value
+    }
+    // Number of rows, as recorded in the column stats.
+    #[inline]
+    fn num_vals(&self) -> RowId {
+        self.stats.num_rows
+    }
+
+    // Translates `range` into the bitpacked domain and delegates the lookup to
+    // `BitUnpacker::get_ids_for_value_range`.
+    //
+    // If the translation yields no range (no stored value can match), `positions` is
+    // cleared and nothing else happens.
+    fn get_row_ids_for_value_range(
+        &self,
+        range: RangeInclusive<u64>,
+        doc_id_range: Range<u32>,
+        positions: &mut Vec<u32>,
+    ) {
+        let Some(transformed_range) =
+            transform_range_before_linear_transformation(&self.stats, range)
+        else {
+            positions.clear();
+            return;
+        };
+        self.bit_unpacker.get_ids_for_value_range(
+            transformed_range,
+            doc_id_range,
+            &self.data,
+            positions,
+        );
+    }
+}
+
+// Bit width used to bitpack the column: `compute_num_bits` of the amplitude of the column
+// divided by its gcd.
+fn num_bits(stats: &ColumnStats) -> u8 {
+    compute_num_bits(stats.amplitude() / stats.gcd)
+}
+
+/// Estimator and serializer of the bitpacked codec.
+///
+/// It holds no state: everything it needs is derived from the `ColumnStats`.
+#[derive(Default)]
+pub struct BitpackedCodecEstimator;
+
+impl ColumnCodecEstimator for BitpackedCodecEstimator {
+    // Nothing to record: the estimate only depends on the column stats.
+    fn collect(&mut self, _value: u64) {}
+
+    // Size of the serialized stats, plus `num_rows * num_bits` bits rounded up to whole bytes.
+    fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
+        let num_bits_per_value = num_bits(stats);
+        Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
+    }
+
+    // Writes the stats, followed by every value bitpacked as `(val - min_value) / gcd`.
+    fn serialize(
+        &self,
+        stats: &ColumnStats,
+        vals: &mut dyn Iterator<Item = u64>,
+        wrt: &mut dyn Write,
+    ) -> io::Result<()> {
+        stats.serialize(wrt)?;
+        let num_bits = num_bits(stats);
+        let mut bit_packer = BitPacker::new();
+        let divider = DividerU64::divide_by(stats.gcd.get());
+        for val in vals {
+            bit_packer.write(divider.divide(val - stats.min_value), num_bits, wrt)?;
+        }
+        bit_packer.close(wrt)?;
+        Ok(())
+    }
+}
+
+/// Codec storing every value as `(value - min_value) / gcd`, bitpacked over a fixed number
+/// of bits.
+pub struct BitpackedCodec;
+
+impl ColumnCodec for BitpackedCodec {
+    type ColumnValues = BitpackedReader;
+    type Estimator = BitpackedCodecEstimator;
+
+    /// Opens a bitpacked column: the stats are read from the head of `data`, and the
+    /// remaining bytes are kept as the bitpacked values.
+    fn load(mut data: OwnedBytes) -> io::Result<Self::ColumnValues> {
+        let stats = ColumnStats::deserialize(&mut data)?;
+        // The bit width is not stored on its own: it is recomputed from the stats.
+        let num_bits = num_bits(&stats);
+        let bit_unpacker = BitUnpacker::new(num_bits);
+        Ok(BitpackedReader {
+            data,
+            bit_unpacker,
+            stats,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::column_values::u64_based::tests::create_and_validate;
+
+    // Small unsorted input.
+    #[test]
+    fn test_with_codec_data_sets_simple() {
+        create_and_validate::<BitpackedCodec>(&[4, 3, 12], "name");
+    }
+
+    // All the values are multiples of 1000.
+    #[test]
+    fn test_with_codec_data_sets_simple_gcd() {
+        create_and_validate::<BitpackedCodec>(&[1000, 2000, 3000], "name");
+    }
+
+    // Runs every shared codec dataset, both in its original order and reversed.
+    #[test]
+    fn test_with_codec_data_sets() {
+        let data_sets = crate::column_values::u64_based::tests::get_codec_test_datasets();
+        for (mut data, name) in data_sets {
+            create_and_validate::<BitpackedCodec>(&data, name);
+            data.reverse();
+            create_and_validate::<BitpackedCodec>(&data, name);
+        }
+    }
+
+    // Random columns of 1 to 256 values, both in their original order and reversed.
+    #[test]
+    fn bitpacked_fast_field_rand() {
+        for _ in 0..500 {
+            let mut data = (0..1 + rand::random::<u8>() as usize)
+                // Halving keeps the values in the lower half of the `u64` range.
+                .map(|_| rand::random::<i64>() as u64 / 2)
+                .collect::<Vec<_>>();
+            create_and_validate::<BitpackedCodec>(&data, "rand");
+            data.reverse();
+            create_and_validate::<BitpackedCodec>(&data, "rand");
+        }
+    }
+}
--- a/columnar/src/column_values/u64_based/blockwise_linear.rs
+++ b/columnar/src/column_values/u64_based/blockwise_linear.rs
@@ -0,0 +1,281 @@
+use std::io::Write;
+use std::sync::Arc;
+use std::{io, iter};
+
+use common::{BinarySerializable, CountingWriter, DeserializeFrom, OwnedBytes};
+use fastdivide::DividerU64;
+use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
+
+use crate::column_values::u64_based::line::Line;
+use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
+use crate::column_values::{ColumnValues, VecColumn};
+use crate::MonotonicallyMappableToU64;
+
+// Number of values per block.
+const BLOCK_SIZE: u32 = 512u32;
+
+// Metadata of one block: the line fitted over the block, and the unpacker for the bitpacked
+// differences to that line.
+#[derive(Debug, Default)]
+struct Block {
+    line: Line,
+    bit_unpacker: BitUnpacker,
+    // Byte offset of the values of this block within the data section.
+    // It is not serialized: `deserialize` leaves it at 0 and `BlockwiseLinearCodec::load`
+    // fills it in.
+    data_start_offset: usize,
+}
+
+impl BinarySerializable for Block {
+    // Writes the line followed by the bit width. `data_start_offset` is not written.
+    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        self.line.serialize(writer)?;
+        self.bit_unpacker.bit_width().serialize(writer)?;
+        Ok(())
+    }
+
+    // Reads the line and the bit width. `data_start_offset` is left at 0.
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        let line = Line::deserialize(reader)?;
+        let bit_width = u8::deserialize(reader)?;
+        Ok(Block {
+            line,
+            bit_unpacker: BitUnpacker::new(bit_width),
+            data_start_offset: 0,
+        })
+    }
+}
+
+// Returns the number of blocks of `BLOCK_SIZE` values required to hold `num_vals` values,
+// i.e. `num_vals / BLOCK_SIZE` rounded up.
+fn compute_num_blocks(num_vals: u32) -> u32 {
+    // Computed as quotient plus remainder check instead of
+    // `(num_vals + BLOCK_SIZE - 1) / BLOCK_SIZE`, as that sum overflows `u32` when `num_vals`
+    // is close to `u32::MAX`.
+    num_vals / BLOCK_SIZE + u32::from(num_vals % BLOCK_SIZE != 0)
+}
+
+/// Estimator and serializer of the blockwise linear codec.
+pub struct BlockwiseLinearEstimator {
+    // Values of the block currently being collected.
+    block: Vec<u64>,
+    // Estimated number of bytes of the bitpacked values of the blocks flushed so far.
+    values_num_bytes: u64,
+    // Estimated number of bytes of the metadata of the blocks flushed so far.
+    meta_num_bytes: u64,
+}
+
+impl Default for BlockwiseLinearEstimator {
+    fn default() -> Self {
+        Self {
+            // A block never holds more than `BLOCK_SIZE` values.
+            block: Vec::with_capacity(BLOCK_SIZE as usize),
+            values_num_bytes: 0u64,
+            meta_num_bytes: 0u64,
+        }
+    }
+}
+
+impl BlockwiseLinearEstimator {
+    // Adds the estimated size of the current block to the running totals.
+    // Does nothing if the current block is empty. The block itself is not cleared here.
+    fn flush_block_estimate(&mut self) {
+        if self.block.is_empty() {
+            return;
+        }
+        let line = Line::train(&VecColumn::from(&self.block));
+        // Largest (wrapping) difference between a value and the line: it sets the bit width.
+        let mut max_value = 0u64;
+        for (i, buffer_val) in self.block.iter().enumerate() {
+            let interpolated_val = line.eval(i as u32);
+            let val = buffer_val.wrapping_sub(interpolated_val);
+            max_value = val.max(max_value);
+        }
+        let bit_width = compute_num_bits(max_value) as usize;
+        // Bitpacked values of the block, rounded up to whole bytes.
+        self.values_num_bytes += (bit_width * self.block.len() + 7) as u64 / 8;
+        // One byte for the bit width, plus the serialized line.
+        self.meta_num_bytes += 1 + line.num_bytes();
+    }
+}
+
+impl ColumnCodecEstimator for BlockwiseLinearEstimator {
+    // Buffers `value`. Once `BLOCK_SIZE` values are buffered, the block is accounted for
+    // and the buffer is emptied.
+    fn collect(&mut self, value: u64) {
+        self.block.push(value);
+        if self.block.len() == BLOCK_SIZE as usize {
+            self.flush_block_estimate();
+            self.block.clear();
+        }
+    }
+    // Estimated size: 4 bytes for the footer length, the stats, and the metadata and values
+    // of all flushed blocks.
+    fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
+        let mut estimate = 4 + stats.num_bytes() + self.meta_num_bytes + self.values_num_bytes;
+        // The block estimates are computed on raw values, whereas `serialize` divides the
+        // values by the gcd. The gain is approximated as `floor(log2(gcd))` bits per row.
+        if stats.gcd.get() > 1 {
+            let estimate_gain_from_gcd =
+                (stats.gcd.get() as f32).log2().floor() * stats.num_rows as f32 / 8.0f32;
+            estimate = estimate.saturating_sub(estimate_gain_from_gcd as u64);
+        }
+        Some(estimate)
+    }
+
+    // Accounts for the last block, which `collect` leaves pending when it is not full.
+    fn finalize(&mut self) {
+        self.flush_block_estimate();
+    }
+
+    // Layout written: stats, bitpacked values of all blocks, metadata of every block
+    // (the footer), and finally the footer length as a `u32`.
+    fn serialize(
+        &self,
+        stats: &ColumnStats,
+        mut vals: &mut dyn Iterator<Item = u64>,
+        wrt: &mut dyn Write,
+    ) -> io::Result<()> {
+        stats.serialize(wrt)?;
+        let mut buffer = Vec::with_capacity(BLOCK_SIZE as usize);
+        let num_blocks = compute_num_blocks(stats.num_rows) as usize;
+        let mut blocks = Vec::with_capacity(num_blocks);
+
+        // A single bit packer is shared by all blocks and only closed after the last one.
+        let mut bit_packer = BitPacker::new();
+
+        let gcd_divider = DividerU64::divide_by(stats.gcd.get());
+
+        for _ in 0..num_blocks {
+            // Pull the next (at most) `BLOCK_SIZE` values out of `vals`.
+            buffer.clear();
+            buffer.extend(
+                (&mut vals)
+                    .map(MonotonicallyMappableToU64::to_u64)
+                    .take(BLOCK_SIZE as usize),
+            );
+
+            // Normalize the values to `(val - min_value) / gcd`.
+            for buffer_val in buffer.iter_mut() {
+                *buffer_val = gcd_divider.divide(*buffer_val - stats.min_value);
+            }
+
+            let line = Line::train(&VecColumn::from(&buffer));
+
+            assert!(!buffer.is_empty());
+
+            // Replace every value by its (wrapping) difference to the line.
+            for (i, buffer_val) in buffer.iter_mut().enumerate() {
+                let interpolated_val = line.eval(i as u32);
+                *buffer_val = buffer_val.wrapping_sub(interpolated_val);
+            }
+
+            // The block uses the bit width of its widest difference.
+            let bit_width = buffer.iter().copied().map(compute_num_bits).max().unwrap();
+
+            for &buffer_val in &buffer {
+                bit_packer.write(buffer_val, bit_width, wrt)?;
+            }
+
+            // `data_start_offset` is not serialized, hence the placeholder value.
+            blocks.push(Block {
+                line,
+                bit_unpacker: BitUnpacker::new(bit_width),
+                data_start_offset: 0,
+            });
+        }
+
+        bit_packer.close(wrt)?;
+
+        assert_eq!(blocks.len(), num_blocks);
+
+        // The counting writer measures the length of the footer as it is written.
+        let mut counting_wrt = CountingWriter::wrap(wrt);
+        for block in &blocks {
+            block.serialize(&mut counting_wrt)?;
+        }
+        let footer_len = counting_wrt.written_bytes();
+        (footer_len as u32).serialize(&mut counting_wrt)?;
+
+        Ok(())
+    }
+}
+
+/// Codec splitting the column into blocks of `BLOCK_SIZE` values. Within each block, a line
+/// is fitted and the differences between the values and that line are bitpacked.
+pub struct BlockwiseLinearCodec;
+
+impl ColumnCodec<u64> for BlockwiseLinearCodec {
+    type ColumnValues = BlockwiseLinearReader;
+
+    type Estimator = BlockwiseLinearEstimator;
+
+    /// Opens a blockwise linear column.
+    ///
+    /// Expected layout: stats, bitpacked values, footer (one metadata entry per block),
+    /// footer length (`u32`, last 4 bytes).
+    ///
+    /// # Errors
+    ///
+    /// Returns an `InvalidData` error if the bytes following the stats are too short to
+    /// hold the footer length, or if the footer length exceeds the available bytes.
+    /// Deserialization errors of the stats and of the block metadata are propagated.
+    fn load(mut bytes: OwnedBytes) -> io::Result<Self::ColumnValues> {
+        let stats = ColumnStats::deserialize(&mut bytes)?;
+        // Checked arithmetic: truncated or corrupted data must yield an error, not a panic.
+        let footer_len_offset = bytes.len().checked_sub(4).ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Blockwise linear column is too short to hold a footer length.",
+            )
+        })?;
+        let footer_len: u32 = (&bytes[footer_len_offset..]).deserialize()?;
+        let footer_offset = footer_len_offset
+            .checked_sub(footer_len as usize)
+            .ok_or_else(|| {
+                io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    "Blockwise linear footer length exceeds the available data.",
+                )
+            })?;
+        let (data, mut footer) = bytes.split(footer_offset);
+        let num_blocks = compute_num_blocks(stats.num_rows);
+        let mut blocks: Vec<Block> = iter::repeat_with(|| Block::deserialize(&mut footer))
+            .take(num_blocks as usize)
+            .collect::<io::Result<_>>()?;
+        // The offsets are not stored: they are rebuilt from the bit width of each block,
+        // every block being counted as `BLOCK_SIZE` values.
+        let mut start_offset = 0;
+        for block in &mut blocks {
+            block.data_start_offset = start_offset;
+            start_offset += (block.bit_unpacker.bit_width() as usize) * BLOCK_SIZE as usize / 8;
+        }
+        Ok(BlockwiseLinearReader {
+            blocks: blocks.into_boxed_slice().into(),
+            data,
+            stats,
+        })
+    }
+}
+
+/// Reader for a column serialized with the blockwise linear codec.
+#[derive(Clone)]
+pub struct BlockwiseLinearReader {
+    // Metadata of every block, in block order.
+    blocks: Arc<[Block]>,
+    // Bitpacked values of all the blocks (the footer has been split off).
+    data: OwnedBytes,
+    stats: ColumnStats,
+}
+
+impl ColumnValues for BlockwiseLinearReader {
+    // Decodes the value of row `idx` as
+    // `min_value + gcd * (line(idx_within_block) + bitpacked_diff)`.
+    #[inline(always)]
+    fn get_val(&self, idx: u32) -> u64 {
+        let block_id = (idx / BLOCK_SIZE) as usize;
+        let idx_within_block = idx % BLOCK_SIZE;
+        let block = &self.blocks[block_id];
+        let interpoled_val: u64 = block.line.eval(idx_within_block);
+        // The unpacker is handed the data starting at the first byte of the block.
+        let block_bytes = &self.data[block.data_start_offset..];
+        let bitpacked_diff = block.bit_unpacker.get(idx_within_block, block_bytes);
+        // TODO optimize me! the line parameters could be tweaked to include the multiplication and
+        // remove the dependency.
+        self.stats.min_value
+            + self
+                .stats
+                .gcd
+                .get()
+                .wrapping_mul(interpoled_val.wrapping_add(bitpacked_diff))
+    }
+
+    // Minimum value, as recorded in the column stats.
+    #[inline(always)]
+    fn min_value(&self) -> u64 {
+        self.stats.min_value
+    }
+
+    // Maximum value, as recorded in the column stats.
+    #[inline(always)]
+    fn max_value(&self) -> u64 {
+        self.stats.max_value
+    }
+
+    // Number of rows, as recorded in the column stats.
+    #[inline(always)]
+    fn num_vals(&self) -> u32 {
+        self.stats.num_rows
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::column_values::u64_based::tests::create_and_validate;
+
+    // Small input. The first value (11) differs from the `..._simple_gcd` input (10).
+    #[test]
+    fn test_with_codec_data_sets_simple() {
+        create_and_validate::<BlockwiseLinearCodec>(
+            &[11, 20, 40, 20, 10, 10, 10, 10, 10, 10],
+            "simple test",
+        )
+        .unwrap();
+    }
+
+    // All the values are multiples of 10. The compression rate is pinned to an exact value.
+    #[test]
+    fn test_with_codec_data_sets_simple_gcd() {
+        let (_, actual_compression_rate) = create_and_validate::<BlockwiseLinearCodec>(
+            &[10, 20, 40, 20, 10, 10, 10, 10, 10, 10],
+            "name",
+        )
+        .unwrap();
+        assert_eq!(actual_compression_rate, 0.175);
+    }
+
+    // Runs every shared codec dataset, both in its original order and reversed.
+    #[test]
+    fn test_with_codec_data_sets() {
+        let data_sets = crate::column_values::u64_based::tests::get_codec_test_datasets();
+        for (mut data, name) in data_sets {
+            create_and_validate::<BlockwiseLinearCodec>(&data, name);
+            data.reverse();
+            create_and_validate::<BlockwiseLinearCodec>(&data, name);
+        }
+    }
+
+    // Random columns of 1 to 256 values, both in their original order and reversed.
+    #[test]
+    fn test_blockwise_linear_fast_field_rand() {
+        for _ in 0..500 {
+            let mut data = (0..1 + rand::random::<u8>() as usize)
+                // Halving keeps the values in the lower half of the `u64` range.
+                .map(|_| rand::random::<i64>() as u64 / 2)
+                .collect::<Vec<_>>();
+            create_and_validate::<BlockwiseLinearCodec>(&data, "rand");
+            data.reverse();
+            create_and_validate::<BlockwiseLinearCodec>(&data, "rand");
+        }
+    }
+}
--- a/columnar/src/column_values/u64_based/line.rs
+++ b/columnar/src/column_values/u64_based/line.rs
@@ -17,8 +17,8 @@ const MID_POINT: u64 = (1u64 << 32) - 1u64;
 /// `y = m * x >> 32 + b`
 #[derive(Debug, Clone, Copy, Default)]
 pub struct Line {
-    slope: u64,
-    intercept: u64,
+    pub(crate) slope: u64,
+    pub(crate) intercept: u64,
 }

 /// Compute the line slope.
@@ -67,21 +67,8 @@ impl Line {
        self.intercept.wrapping_add(linear_part)
    }

-    // Same as train, but the intercept is only estimated from provided sample positions
-    pub fn estimate(sample_positions_and_values: &[(u64, u64)]) -> Self {
-        let first_val = sample_positions_and_values[0].1;
-        let last_val = sample_positions_and_values[sample_positions_and_values.len() - 1].1;
-        let num_vals = sample_positions_and_values[sample_positions_and_values.len() - 1].0 + 1;
-        Self::train_from(
-            first_val,
-            last_val,
-            num_vals as u32,
-            sample_positions_and_values.iter().cloned(),
-        )
-    }
-
    // Intercept is only computed from provided positions
-    fn train_from(
+    pub fn train_from(
        first_val: u64,
        last_val: u64,
        num_vals: u32,
@@ -145,6 +132,7 @@ impl Line {
    ///
    /// This function is only invariable by translation if all of the
    /// `ys` are packaged into half of the space. (See heuristic below)
+    /// TODO USE array
    pub fn train(ys: &dyn ColumnValues) -> Self {
        let first_val = ys.iter().next().unwrap();
        let last_val = ys.iter().nth(ys.num_vals() as usize - 1).unwrap();
@@ -158,7 +146,7 @@ impl Line {
 }

 impl BinarySerializable for Line {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
+    fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        VInt(self.slope).serialize(writer)?;
        VInt(self.intercept).serialize(writer)?;
        Ok(())
--- a/columnar/src/column_values/u64_based/linear.rs
+++ b/columnar/src/column_values/u64_based/linear.rs
@@ -0,0 +1,277 @@
+use std::io;
+
+use common::{BinarySerializable, OwnedBytes};
+use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
+
+use super::line::Line;
+use super::ColumnValues;
+use crate::column_values::u64_based::{ColumnCodec, ColumnCodecEstimator, ColumnStats};
+use crate::column_values::VecColumn;
+use crate::RowId;
+
+// Offset added to the deviations while they are collected, and removed again in `finalize`.
+// Presumably it keeps small negative deviations from wrapping around 0, so that they can be
+// compared with `min`/`max`. TODO confirm.
+const HALF_SPACE: u64 = u64::MAX / 2;
+// Number of leading values buffered before the line is trained.
+const LINE_ESTIMATION_BLOCK_LEN: usize = 512;
+
+/// Reader for a column serialized with the linear codec.
+///
+/// A value is decoded as `line(row) + bitpacked_diff` (wrapping addition).
+#[derive(Clone)]
+pub struct LinearReader {
+    // What remains of the input bytes after the stats and the linear params were read.
+    data: OwnedBytes,
+    linear_params: LinearParams,
+    stats: ColumnStats,
+}
+
+impl ColumnValues for LinearReader {
+    // Evaluates the line at `doc` and adds the bitpacked difference stored for that row.
+    #[inline]
+    fn get_val(&self, doc: u32) -> u64 {
+        let interpoled_val: u64 = self.linear_params.line.eval(doc);
+        let bitpacked_diff = self.linear_params.bit_unpacker.get(doc, &self.data);
+        interpoled_val.wrapping_add(bitpacked_diff)
+    }
+
+    // Minimum value, as recorded in the column stats.
+    #[inline(always)]
+    fn min_value(&self) -> u64 {
+        self.stats.min_value
+    }
+
+    // Maximum value, as recorded in the column stats.
+    #[inline(always)]
+    fn max_value(&self) -> u64 {
+        self.stats.max_value
+    }
+
+    // Number of rows, as recorded in the column stats.
+    #[inline]
+    fn num_vals(&self) -> u32 {
+        self.stats.num_rows
+    }
+}
+
+/// Fastfield serializer, which tries to guess values by linear interpolation
+/// and stores the difference bitpacked.
+pub struct LinearCodec;
+
+// Parameters required to decode a column: the line, and the unpacker for the bitpacked
+// differences to that line.
+#[derive(Debug, Clone)]
+struct LinearParams {
+    line: Line,
+    bit_unpacker: BitUnpacker,
+}
+
+impl BinarySerializable for LinearParams {
+    // Writes the line followed by the bit width.
+    fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
+        self.line.serialize(writer)?;
+        self.bit_unpacker.bit_width().serialize(writer)?;
+        Ok(())
+    }
+
+    // Reads the line and the bit width, and rebuilds the unpacker from the latter.
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        let line = Line::deserialize(reader)?;
+        let bit_width = u8::deserialize(reader)?;
+        Ok(Self {
+            line,
+            bit_unpacker: BitUnpacker::new(bit_width),
+        })
+    }
+}
+
+/// Estimator and serializer of the linear codec.
+pub struct LinearCodecEstimator {
+    // Leading values, buffered until the line has been trained.
+    block: Vec<u64>,
+    // `None` until `LINE_ESTIMATION_BLOCK_LEN` values have been collected.
+    line: Option<Line>,
+    // Row id of the next value whose deviation will be measured.
+    row_id: RowId,
+    // Smallest and largest deviations observed so far (shifted by `HALF_SPACE`).
+    min_deviation: u64,
+    max_deviation: u64,
+    // First and last values whose deviation was measured.
+    first_val: u64,
+    last_val: u64,
+}
+
+impl Default for LinearCodecEstimator {
+    fn default() -> LinearCodecEstimator {
+        LinearCodecEstimator {
+            block: Vec::with_capacity(LINE_ESTIMATION_BLOCK_LEN),
+            line: None,
+            row_id: 0,
+            // Neutral elements for the running `min` and `max`.
+            min_deviation: u64::MAX,
+            max_deviation: u64::MIN,
+            first_val: 0u64,
+            last_val: 0u64,
+        }
+    }
+}
+
+impl ColumnCodecEstimator for LinearCodecEstimator {
+    // Moves the intercept of the line by `min_deviation - HALF_SPACE`.
+    //
+    // After this shift, the offset `value - line(pos)` written by `serialize` is equal to
+    // `deviation - min_deviation`, hence within `[0, max_deviation - min_deviation]`.
+    fn finalize(&mut self) {
+        if let Some(line) = self.line.as_mut() {
+            line.intercept = line
+                .intercept
+                .wrapping_add(self.min_deviation)
+                .wrapping_sub(HALF_SPACE);
+        }
+    }
+
+    // Returns `None` if no line was trained, i.e. if fewer than `LINE_ESTIMATION_BLOCK_LEN`
+    // values were collected.
+    // Otherwise: stats, linear params, and `num_bits` bits per row rounded up to whole bytes.
+    fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
+        let line = self.line?;
+        let amplitude = self.max_deviation - self.min_deviation;
+        let num_bits = compute_num_bits(amplitude);
+        let linear_params = LinearParams {
+            line,
+            bit_unpacker: BitUnpacker::new(num_bits),
+        };
+        Some(
+            stats.num_bytes()
+                + linear_params.num_bytes()
+                + (num_bits as u64 * stats.num_rows as u64 + 7) / 8,
+        )
+    }
+
+    // Writes the stats, the linear params, and then for every value its (wrapping)
+    // difference to the line, bitpacked.
+    //
+    // Panics if no line was trained (in which case `estimate` returns `None`).
+    fn serialize(
+        &self,
+        stats: &ColumnStats,
+        vals: &mut dyn Iterator<Item = u64>,
+        wrt: &mut dyn io::Write,
+    ) -> io::Result<()> {
+        stats.serialize(wrt)?;
+        let line = self.line.unwrap();
+        let amplitude = self.max_deviation - self.min_deviation;
+        let num_bits = compute_num_bits(amplitude);
+        let linear_params = LinearParams {
+            line,
+            bit_unpacker: BitUnpacker::new(num_bits),
+        };
+        linear_params.serialize(wrt)?;
+        let mut bit_packer = BitPacker::new();
+        for (pos, value) in vals.enumerate() {
+            let calculated_value = line.eval(pos as u32);
+            let offset = value.wrapping_sub(calculated_value);
+            bit_packer.write(offset, num_bits, wrt)?;
+        }
+        bit_packer.close(wrt)?;
+        Ok(())
+    }
+
+    // Until the line is trained, values are buffered. Afterwards, their deviation to the
+    // line is measured right away.
+    fn collect(&mut self, value: u64) {
+        if let Some(line) = self.line {
+            self.collect_after_line_estimation(&line, value);
+        } else {
+            self.collect_before_line_estimation(value);
+        }
+    }
+}
+
+impl LinearCodecEstimator {
+    // Measures the deviation of `value` (the value of row `self.row_id`) to `line`, updates
+    // the running min/max deviations, and moves on to the next row.
+    #[inline]
+    fn collect_after_line_estimation(&mut self, line: &Line, value: u64) {
+        let interpoled_val: u64 = line.eval(self.row_id);
+        // `value - line(row_id)`, shifted by `HALF_SPACE`, in wrapping arithmetic.
+        let deviation = value.wrapping_add(HALF_SPACE).wrapping_sub(interpoled_val);
+        self.min_deviation = self.min_deviation.min(deviation);
+        self.max_deviation = self.max_deviation.max(deviation);
+        if self.row_id == 0 {
+            self.first_val = value;
+        }
+        self.last_val = value;
+        self.row_id += 1u32;
+    }
+
+    // Buffers `value`. Once `LINE_ESTIMATION_BLOCK_LEN` values are buffered, the line is
+    // trained on them and the buffered values are replayed against it.
+    #[inline]
+    fn collect_before_line_estimation(&mut self, value: u64) {
+        self.block.push(value);
+        if self.block.len() == LINE_ESTIMATION_BLOCK_LEN {
+            let line = Line::train(&VecColumn::from(&self.block));
+            // Takes the buffer out of `self`, leaving an empty one behind.
+            let block = std::mem::take(&mut self.block);
+            for val in block {
+                self.collect_after_line_estimation(&line, val);
+            }
+            self.line = Some(line);
+        }
+    }
+}
+
+impl ColumnCodec for LinearCodec {
+    type ColumnValues = LinearReader;
+
+    type Estimator = LinearCodecEstimator;
+
+    // Reads the stats and then the linear params from the head of `data`. The remaining
+    // bytes are kept as the bitpacked values.
+    fn load(mut data: OwnedBytes) -> io::Result<Self::ColumnValues> {
+        let stats = ColumnStats::deserialize(&mut data)?;
+        let linear_params = LinearParams::deserialize(&mut data)?;
+        Ok(LinearReader {
+            stats,
+            linear_params,
+            data,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rand::RngCore;
+
+    use super::*;
+    use crate::column_values::u64_based::tests::{create_and_validate, get_codec_test_datasets};
+
+    // Exactly `LINE_ESTIMATION_BLOCK_LEN` consecutive integers: just enough values for the
+    // line to be trained.
+    #[test]
+    fn test_compression_simple() {
+        let vals = (100u64..)
+            .take(super::LINE_ESTIMATION_BLOCK_LEN)
+            .collect::<Vec<_>>();
+        create_and_validate::<LinearCodec>(&vals, "simple monotonically large").unwrap();
+    }
+
+    // Consecutive integers: bounds both the actual compression and the estimate.
+    #[test]
+    fn test_compression() {
+        let data = (10..=6_000_u64).collect::<Vec<_>>();
+        let (estimate, actual_compression) =
+            create_and_validate::<LinearCodec>(&data, "simple monotonically large").unwrap();
+        assert_le!(actual_compression, 0.001);
+        assert_le!(estimate, 0.02);
+    }
+
+    // Runs every shared codec dataset, both in its original order and reversed.
+    #[test]
+    fn test_with_codec_datasets() {
+        let data_sets = get_codec_test_datasets();
+        for (mut data, name) in data_sets {
+            create_and_validate::<LinearCodec>(&data, name);
+            data.reverse();
+            create_and_validate::<LinearCodec>(&data, name);
+        }
+    }
+    // Three very large values with a large gap between them.
+    #[test]
+    fn linear_interpol_fast_field_test_large_amplitude() {
+        let data = vec![
+            i64::MAX as u64 / 2,
+            i64::MAX as u64 / 3,
+            i64::MAX as u64 / 2,
+        ];
+        create_and_validate::<LinearCodec>(&data, "large amplitude");
+    }
+
+    // Large decreasing values ending on 0.
+    #[test]
+    fn overflow_error_test() {
+        let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
+        create_and_validate::<LinearCodec>(&data, "overflow test");
+    }
+
+    // Increasing values whose increments grow.
+    #[test]
+    fn linear_interpol_fast_concave_data() {
+        let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
+        create_and_validate::<LinearCodec>(&data, "concave data");
+    }
+    // Increasing values whose increments shrink.
+    #[test]
+    fn linear_interpol_fast_convex_data() {
+        let data = vec![0, 40, 60, 70, 75, 77];
+        create_and_validate::<LinearCodec>(&data, "convex data");
+    }
+    // Eleven consecutive integers.
+    #[test]
+    fn linear_interpol_fast_field_test_simple() {
+        let data = (10..=20_u64).collect::<Vec<_>>();
+        create_and_validate::<LinearCodec>(&data, "simple monotonically");
+    }
+
+    // Columns of 10_000 random `u64` values, both in their original order and reversed.
+    #[test]
+    fn linear_interpol_fast_field_rand() {
+        let mut rng = rand::thread_rng();
+        for _ in 0..50 {
+            let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
+            create_and_validate::<LinearCodec>(&data, "random");
+            data.reverse();
+            create_and_validate::<LinearCodec>(&data, "random");
+        }
+    }
+}
--- a/columnar/src/column_values/u64_based/mod.rs
+++ b/columnar/src/column_values/u64_based/mod.rs
@@ -0,0 +1,214 @@
+mod bitpacked;
+mod blockwise_linear;
+mod line;
+mod linear;
+mod stats_collector;
+
+use std::io;
+use std::io::Write;
+use std::sync::Arc;
+
+use common::{BinarySerializable, OwnedBytes};
+
+use crate::column_values::monotonic_mapping::{
+    StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
+};
+pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
+pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
+pub use crate::column_values::u64_based::linear::LinearCodec;
+pub use crate::column_values::u64_based::stats_collector::StatsCollector;
+use crate::column_values::{monotonic_map_column, ColumnStats};
+use crate::iterable::Iterable;
+use crate::{ColumnValues, MonotonicallyMappableToU64};
+
+/// A `ColumnCodecEstimator` is in charge of gathering all
+/// data required to serialize a column.
+///
+/// This happens during a first pass on data of the column elements.
+/// During that pass, all column estimators receive a call to their
+/// `.collect(el)`.
+///
+/// After this first pass, `.finalize()` is called.
+/// `.estimate(..)` then should return an accurate estimation of the
+/// size of the serialized column (were we to pick this codec).
+/// `.serialize(..)` then serializes the column using this codec.
+pub trait ColumnCodecEstimator<T = u64>: 'static {
+    /// Records a new value for estimation.
+    /// This method will be called for each element of the column during
+    /// `estimation`.
+    fn collect(&mut self, value: u64);
+    /// Finalizes the first pass phase.
+    ///
+    /// The default implementation does nothing.
+    fn finalize(&mut self) {}
+    /// Returns an accurate estimation of the number of bytes that will
+    /// be used to represent this column.
+    ///
+    /// Returning `None` excludes this codec from the selection done in
+    /// `serialize_u64_based_column_values`.
+    fn estimate(&self, stats: &ColumnStats) -> Option<u64>;
+    /// Serializes the column using the given codec.
+    /// This constitutes a second pass over the columns values.
+    fn serialize(
+        &self,
+        stats: &ColumnStats,
+        vals: &mut dyn Iterator<Item = T>,
+        wrt: &mut dyn io::Write,
+    ) -> io::Result<()>;
+}
+
+/// A column codec describes a column serialization format.
+pub trait ColumnCodec<T: PartialOrd = u64> {
+    /// Specialized `ColumnValues` type.
+    type ColumnValues: ColumnValues<T> + 'static;
+    /// `Estimator` for the given codec.
+    type Estimator: ColumnCodecEstimator + Default;
+
+    /// Loads a column that has been serialized using this codec.
+    fn load(bytes: OwnedBytes) -> io::Result<Self::ColumnValues>;
+
+    /// Returns an estimator, built from the `Default` implementation of `Self::Estimator`.
+    fn estimator() -> Self::Estimator {
+        Self::Estimator::default()
+    }
+
+    /// Returns a boxed estimator.
+    fn boxed_estimator() -> Box<dyn ColumnCodecEstimator> {
+        Box::new(Self::estimator())
+    }
+}
+
+/// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted data.
+///
+/// The `u8` discriminant of a variant is the code written in front of the serialized column
+/// (see `to_code` and `try_from_code`).
+#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
+#[repr(u8)]
+pub enum CodecType {
+    /// Bitpack all values in the value range. The number of bits is defined by the amplitude
+    /// `column.max_value() - column.min_value()`
+    Bitpacked = 0u8,
+    /// Linear interpolation puts a line between the first and last value and then bitpacks the
+    /// values by the offset from the line. The number of bits is defined by the max deviation from
+    /// the line.
+    Linear = 1u8,
+    /// Same as [`CodecType::Linear`], but encodes in blocks of 512 elements.
+    BlockwiseLinear = 2u8,
+}
+
+/// List of all available u64-based codecs.
+pub const ALL_U64_CODEC_TYPES: [CodecType; 3] = [
+    CodecType::Bitpacked,
+    CodecType::Linear,
+    CodecType::BlockwiseLinear,
+];
+
+impl CodecType {
+    // Code identifying the codec in the serialized data: the `u8` discriminant.
+    fn to_code(self) -> u8 {
+        self as u8
+    }
+
+    // Inverse of `to_code`. Returns `None` for an unknown code.
+    fn try_from_code(code: u8) -> Option<CodecType> {
+        match code {
+            0u8 => Some(CodecType::Bitpacked),
+            1u8 => Some(CodecType::Linear),
+            2u8 => Some(CodecType::BlockwiseLinear),
+            _ => None,
+        }
+    }
+
+    // Loads `bytes` with the codec designated by `self`, exposing the values as `T`.
+    fn load<T: MonotonicallyMappableToU64>(
+        &self,
+        bytes: OwnedBytes,
+    ) -> io::Result<Arc<dyn ColumnValues<T>>> {
+        match self {
+            CodecType::Bitpacked => load_specific_codec::<BitpackedCodec, T>(bytes),
+            CodecType::Linear => load_specific_codec::<LinearCodec, T>(bytes),
+            CodecType::BlockwiseLinear => load_specific_codec::<BlockwiseLinearCodec, T>(bytes),
+        }
+    }
+}
+
+// Loads `bytes` with the codec `C`, then wraps the resulting u64 reader with
+// `monotonic_map_column` so that it yields values of type `T`.
+fn load_specific_codec<C: ColumnCodec, T: MonotonicallyMappableToU64>(
+    bytes: OwnedBytes,
+) -> io::Result<Arc<dyn ColumnValues<T>>> {
+    let reader = C::load(bytes)?;
+    let reader_typed = monotonic_map_column(
+        reader,
+        StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<T>::new()),
+    );
+    Ok(Arc::new(reader_typed))
+}
+
+impl CodecType {
+    /// Returns a boxed codec estimator associated to a given `CodecType`.
+    ///
+    /// A fresh estimator is created on every call.
+    pub fn estimator(&self) -> Box<dyn ColumnCodecEstimator> {
+        match self {
+            CodecType::Bitpacked => BitpackedCodec::boxed_estimator(),
+            CodecType::Linear => LinearCodec::boxed_estimator(),
+            CodecType::BlockwiseLinear => BlockwiseLinearCodec::boxed_estimator(),
+        }
+    }
+}
+
+/// Serializes a given column of u64-mapped values.
+///
+/// The values are iterated twice: a first pass feeds the stats collector and one estimator
+/// per codec of `codec_types`, and a second pass serializes the values with the codec whose
+/// estimate is the smallest.
+///
+/// The output is the code of the selected codec, followed by the data written by that codec.
+///
+/// # Errors
+///
+/// Returns an `InvalidData` error if no codec of `codec_types` yields an estimate.
+/// Errors of the underlying writer are propagated.
+pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
+    vals: &dyn Iterable<T>,
+    codec_types: &[CodecType],
+    wrt: &mut dyn Write,
+) -> io::Result<()> {
+    let mut stats_collector = StatsCollector::default();
+    let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
+        Vec::with_capacity(codec_types.len());
+    for &codec_type in codec_types {
+        estimators.push((codec_type, codec_type.estimator()));
+    }
+    // First pass: every value goes to the stats collector and to all the estimators.
+    for val in vals.boxed_iter() {
+        let val_u64 = val.to_u64();
+        stats_collector.collect(val_u64);
+        for (_, estimator) in &mut estimators {
+            estimator.collect(val_u64);
+        }
+    }
+    for (_, estimator) in &mut estimators {
+        estimator.finalize();
+    }
+    let stats = stats_collector.stats();
+    // Estimators returning `None` are dropped. Among the others, the smallest estimate wins.
+    let (_, best_codec, best_codec_estimator) = estimators
+        .into_iter()
+        .flat_map(|(codec_type, estimator)| {
+            let num_bytes = estimator.estimate(&stats)?;
+            Some((num_bytes, codec_type, estimator))
+        })
+        .min_by_key(|(num_bytes, _, _)| *num_bytes)
+        .ok_or_else(|| {
+            io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
+        })?;
+    best_codec.to_code().serialize(wrt)?;
+    // Second pass over the values, with the selected codec.
+    best_codec_estimator.serialize(
+        &stats,
+        &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
+        wrt,
+    )?;
+    Ok(())
+}
+
+/// Load u64-based column values.
+///
+/// This method first identifies the codec off the first byte.
+///
+/// # Errors
+///
+/// Returns an `InvalidData` error if `bytes` is empty or if its first byte is not a known
+/// codec code. Errors raised by the codec while loading are propagated.
+pub fn load_u64_based_column_values<T: MonotonicallyMappableToU64>(
+    mut bytes: OwnedBytes,
+) -> io::Result<Arc<dyn ColumnValues<T>>> {
+    let codec_type: CodecType = bytes
+        .first()
+        .copied()
+        .and_then(CodecType::try_from_code)
+        .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Failed to read codec type"))?;
+    // Skip the codec code: the codec only gets the bytes that follow it.
+    bytes.advance(1);
+    codec_type.load(bytes)
+}
+
+/// Helper function to serialize a column into an in-memory buffer (the codec being selected
+/// among `codec_types`) and then open it.
+///
+/// # Panics
+///
+/// Panics if the serialization or the loading fails.
+pub fn serialize_and_load_u64_based_column_values<T: MonotonicallyMappableToU64>(
+    vals: &dyn Iterable,
+    codec_types: &[CodecType],
+) -> Arc<dyn ColumnValues<T>> {
+    let mut buffer = Vec::new();
+    serialize_u64_based_column_values(vals, codec_types, &mut buffer).unwrap();
+    load_u64_based_column_values::<T>(OwnedBytes::new(buffer)).unwrap()
+}
+
+#[cfg(test)]
+mod tests;
--- a/columnar/src/column_values/u64_based/stats_collector.rs
+++ b/columnar/src/column_values/u64_based/stats_collector.rs
@@ -0,0 +1,200 @@
+use std::num::NonZeroU64;
+
+use fastdivide::DividerU64;
+
+use crate::column_values::ColumnStats;
+use crate::RowId;
+
+/// Compute the gcd of two non-zero numbers.
+///
+/// It is recommended, but not required, to feed values such that `large >= small`.
+fn compute_gcd(mut large: NonZeroU64, mut small: NonZeroU64) -> NonZeroU64 {
+    loop {
+        let rem: u64 = large.get() % small;
+        if let Some(new_small) = NonZeroU64::new(rem) {
+            (large, small) = (small, new_small);
+        } else {
+            return small;
+        }
+    }
+}
+
+#[derive(Default)]
+pub struct StatsCollector {
+    min_max_opt: Option<(u64, u64)>,
+    num_rows: RowId,
+    // We measure the GCD of the differences between the values and the minimal value.
+    // This is the same as the GCD of the differences between the values and the first value.
+    //
+    // This way, we can compress i64-converted-to-u64 (e.g. timestamps that were supplied in
+    // seconds, only to be converted to nanoseconds).
+    increment_gcd_opt: Option<(NonZeroU64, DividerU64)>,
+    first_value_opt: Option<u64>,
+}
+
+impl StatsCollector {
+    pub fn stats(&self) -> ColumnStats {
+        let (min_value, max_value) = self.min_max_opt.unwrap_or((0u64, 0u64));
+        let increment_gcd = if let Some((increment_gcd, _)) = self.increment_gcd_opt {
+            increment_gcd
+        } else {
+            NonZeroU64::new(1u64).unwrap()
+        };
+        ColumnStats {
+            min_value,
+            max_value,
+            num_rows: self.num_rows,
+            gcd: increment_gcd,
+        }
+    }
+
+    #[inline]
+    fn update_increment_gcd(&mut self, value: u64) {
+        let Some(first_value) = self.first_value_opt else {
+            // We set the first value and just quit.
+            self.first_value_opt = Some(value);
+            return;
+        };
+        let Some(non_zero_value) = NonZeroU64::new(value.abs_diff(first_value)) else {
+            // The value equals the first value: a zero difference does not affect the gcd.
+            return;
+        };
+        let Some((gcd, gcd_divider)) = self.increment_gcd_opt else {
+            self.set_increment_gcd(non_zero_value);
+            return;
+        };
+        if gcd.get() == 1 {
+            // A gcd of 1 cannot decrease any further.
+            return;
+        }
+        let remainder =
+            non_zero_value.get() - (gcd_divider.divide(non_zero_value.get())) * gcd.get();
+        if remainder == 0 {
+            return;
+        }
+        let new_gcd = compute_gcd(non_zero_value, gcd);
+        self.set_increment_gcd(new_gcd);
+    }
+
+    fn set_increment_gcd(&mut self, gcd: NonZeroU64) {
+        let new_divider = DividerU64::divide_by(gcd.get());
+        self.increment_gcd_opt = Some((gcd, new_divider));
+    }
+
+    pub fn collect(&mut self, value: u64) {
+        self.min_max_opt = Some(if let Some((min, max)) = self.min_max_opt {
+            (min.min(value), max.max(value))
+        } else {
+            (value, value)
+        });
+        self.num_rows += 1;
+        self.update_increment_gcd(value);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::num::NonZeroU64;
+
+    use crate::column_values::u64_based::stats_collector::{compute_gcd, StatsCollector};
+    use crate::column_values::u64_based::ColumnStats;
+
+    fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
+        let mut stats_collector = StatsCollector::default();
+        for val in vals {
+            stats_collector.collect(val);
+        }
+        stats_collector.stats()
+    }
+
+    fn find_gcd(vals: impl Iterator<Item = u64>) -> u64 {
+        compute_stats(vals).gcd.get()
+    }
+
+    #[test]
+    fn test_compute_gcd() {
+        let test_compute_gcd_aux = |large, small, expected| {
+            let large = NonZeroU64::new(large).unwrap();
+            let small = NonZeroU64::new(small).unwrap();
+            let expected = NonZeroU64::new(expected).unwrap();
+            assert_eq!(compute_gcd(small, large), expected);
+            assert_eq!(compute_gcd(large, small), expected);
+        };
+        test_compute_gcd_aux(1, 4, 1);
+        test_compute_gcd_aux(2, 4, 2);
+        test_compute_gcd_aux(10, 25, 5);
+        test_compute_gcd_aux(25, 25, 25);
+    }
+
+    #[test]
+    fn test_gcd() {
+        assert_eq!(find_gcd([0].into_iter()), 1);
+        assert_eq!(find_gcd([0, 10].into_iter()), 10);
+        assert_eq!(find_gcd([10, 0].into_iter()), 10);
+        assert_eq!(find_gcd([].into_iter()), 1);
+        assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), 5);
+        assert_eq!(find_gcd([15, 16, 10].into_iter()), 1);
+        assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), 5);
+        assert_eq!(find_gcd([0, 0].into_iter()), 1);
+        assert_eq!(find_gcd([1, 10, 4, 1, 7, 10].into_iter()), 3);
+        assert_eq!(find_gcd([1, 10, 0, 4, 1, 7, 10].into_iter()), 1);
+    }
+
+    #[test]
+    fn test_stats() {
+        assert_eq!(
+            compute_stats([].into_iter()),
+            ColumnStats {
+                gcd: NonZeroU64::new(1).unwrap(),
+                min_value: 0,
+                max_value: 0,
+                num_rows: 0
+            }
+        );
+        assert_eq!(
+            compute_stats([0, 1].into_iter()),
+            ColumnStats {
+                gcd: NonZeroU64::new(1).unwrap(),
+                min_value: 0,
+                max_value: 1,
+                num_rows: 2
+            }
+        );
+        assert_eq!(
+            compute_stats([0, 1].into_iter()),
+            ColumnStats {
+                gcd: NonZeroU64::new(1).unwrap(),
+                min_value: 0,
+                max_value: 1,
+                num_rows: 2
+            }
+        );
+        assert_eq!(
+            compute_stats([10, 20, 30].into_iter()),
+            ColumnStats {
+                gcd: NonZeroU64::new(10).unwrap(),
+                min_value: 10,
+                max_value: 30,
+                num_rows: 3
+            }
+        );
+        assert_eq!(
+            compute_stats([10, 50, 10, 30].into_iter()),
+            ColumnStats {
+                gcd: NonZeroU64::new(20).unwrap(),
+                min_value: 10,
+                max_value: 50,
+                num_rows: 4
+            }
+        );
+        assert_eq!(
+            compute_stats([10, 0, 30].into_iter()),
+            ColumnStats {
+                gcd: NonZeroU64::new(10).unwrap(),
+                min_value: 0,
+                max_value: 30,
+                num_rows: 3
+            }
+        );
+    }
+}
--- a/columnar/src/column_values/u64_based/tests.rs
+++ b/columnar/src/column_values/u64_based/tests.rs
@@ -0,0 +1,415 @@
+use proptest::prelude::*;
+use proptest::strategy::Strategy;
+use proptest::{prop_oneof, proptest};
+
+#[test]
+fn test_serialize_and_load_simple() {
+    let mut buffer = Vec::new();
+    let vals = &[1u64, 2u64, 5u64];
+    serialize_u64_based_column_values(
+        &&vals[..],
+        &[CodecType::Bitpacked, CodecType::BlockwiseLinear],
+        &mut buffer,
+    )
+    .unwrap();
+    assert_eq!(buffer.len(), 7);
+    let col = load_u64_based_column_values::<u64>(OwnedBytes::new(buffer)).unwrap();
+    assert_eq!(col.num_vals(), 3);
+    assert_eq!(col.get_val(0), 1);
+    assert_eq!(col.get_val(1), 2);
+    assert_eq!(col.get_val(2), 5);
+}
+
+#[test]
+fn test_empty_column_i64() {
+    let vals: [i64; 0] = [];
+    let mut num_acceptable_codecs = 0;
+    for codec in ALL_U64_CODEC_TYPES {
+        let mut buffer = Vec::new();
+        if serialize_u64_based_column_values(&&vals[..], &[codec], &mut buffer).is_err() {
+            continue;
+        }
+        num_acceptable_codecs += 1;
+        let col = load_u64_based_column_values::<i64>(OwnedBytes::new(buffer)).unwrap();
+        assert_eq!(col.num_vals(), 0);
+        assert_eq!(col.min_value(), i64::MIN);
+        assert_eq!(col.max_value(), i64::MIN);
+    }
+    assert!(num_acceptable_codecs > 0);
+}
+
+#[test]
+fn test_empty_column_u64() {
+    let vals: [u64; 0] = [];
+    let mut num_acceptable_codecs = 0;
+    for codec in ALL_U64_CODEC_TYPES {
+        let mut buffer = Vec::new();
+        if serialize_u64_based_column_values(&&vals[..], &[codec], &mut buffer).is_err() {
+            continue;
+        }
+        num_acceptable_codecs += 1;
+        let col = load_u64_based_column_values::<u64>(OwnedBytes::new(buffer)).unwrap();
+        assert_eq!(col.num_vals(), 0);
+        assert_eq!(col.min_value(), u64::MIN);
+        assert_eq!(col.max_value(), u64::MIN);
+    }
+    assert!(num_acceptable_codecs > 0);
+}
+
+#[test]
+fn test_empty_column_f64() {
+    let vals: [f64; 0] = [];
+    let mut num_acceptable_codecs = 0;
+    for codec in ALL_U64_CODEC_TYPES {
+        let mut buffer = Vec::new();
+        if serialize_u64_based_column_values(&&vals[..], &[codec], &mut buffer).is_err() {
+            continue;
+        }
+        num_acceptable_codecs += 1;
+        let col = load_u64_based_column_values::<f64>(OwnedBytes::new(buffer)).unwrap();
+        assert_eq!(col.num_vals(), 0);
+        // FIXME. f64::MIN would be better!
+        assert!(col.min_value().is_nan());
+        assert!(col.max_value().is_nan());
+    }
+    assert!(num_acceptable_codecs > 0);
+}
+
+pub(crate) fn create_and_validate<TColumnCodec: ColumnCodec>(
+    vals: &[u64],
+    name: &str,
+) -> Option<(f32, f32)> {
+    let mut stats_collector = StatsCollector::default();
+    let mut codec_estimator: TColumnCodec::Estimator = Default::default();
+
+    for val in vals.boxed_iter() {
+        stats_collector.collect(val);
+        codec_estimator.collect(val);
+    }
+    codec_estimator.finalize();
+    let stats = stats_collector.stats();
+    let estimation = codec_estimator.estimate(&stats)?;
+
+    let mut buffer = Vec::new();
+    codec_estimator
+        .serialize(&stats, vals.boxed_iter().as_mut(), &mut buffer)
+        .unwrap();
+
+    let actual_compression = buffer.len() as u64;
+
+    let reader = TColumnCodec::load(OwnedBytes::new(buffer)).unwrap();
+    assert_eq!(reader.num_vals(), vals.len() as u32);
+    let mut buffer = Vec::new();
+    for (doc, orig_val) in vals.iter().copied().enumerate() {
+        let val = reader.get_val(doc as u32);
+        assert_eq!(
+            val, orig_val,
+            "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data `{vals:?}`",
+        );
+
+        buffer.resize(1, 0);
+        reader.get_vals(&[doc as u32], &mut buffer);
+        let val = buffer[0];
+        assert_eq!(
+            val, orig_val,
+            "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data `{vals:?}`",
+        );
+    }
+
+    let all_docs: Vec<u32> = (0..vals.len() as u32).collect();
+    buffer.resize(all_docs.len(), 0);
+    reader.get_vals(&all_docs, &mut buffer);
+    assert_eq!(vals, buffer);
+
+    if !vals.is_empty() {
+        let test_rand_idx = rand::thread_rng().gen_range(0..=vals.len() - 1);
+        let expected_positions: Vec<u32> = vals
+            .iter()
+            .enumerate()
+            .filter(|(_, el)| **el == vals[test_rand_idx])
+            .map(|(pos, _)| pos as u32)
+            .collect();
+        let mut positions = Vec::new();
+        reader.get_row_ids_for_value_range(
+            vals[test_rand_idx]..=vals[test_rand_idx],
+            0..vals.len() as u32,
+            &mut positions,
+        );
+        assert_eq!(expected_positions, positions);
+    }
+    if actual_compression > 1000 {
+        assert!(relative_difference(estimation, actual_compression) < 0.10f32);
+    }
+    Some((
+        compression_rate(estimation, stats.num_rows),
+        compression_rate(actual_compression, stats.num_rows),
+    ))
+}
+
+fn compression_rate(num_bytes: u64, num_values: u32) -> f32 {
+    num_bytes as f32 / (num_values as f32 * 8.0)
+}
+
+fn relative_difference(left: u64, right: u64) -> f32 {
+    let left = left as f32;
+    let right = right as f32;
+    2.0f32 * (left - right).abs() / (left + right)
+}
+
+proptest! {
+    #![proptest_config(ProptestConfig::with_cases(100))]
+
+    #[test]
+    fn test_proptest_small_bitpacked(data in proptest::collection::vec(num_strategy(), 1..10)) {
+        create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
+    }
+
+    #[test]
+    fn test_proptest_small_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
+        create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
+    }
+
+
+    #[test]
+    fn test_proptest_small_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
+        create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
+    }
+}
+
+#[test]
+fn test_small_blockwise_linear_example() {
+    create_and_validate::<BlockwiseLinearCodec>(
+        &[9223372036854775808, 9223370937344622593],
+        "proptest multilinearinterpol",
+    );
+}
+
+proptest! {
+    #![proptest_config(ProptestConfig::with_cases(10))]
+
+    #[test]
+    fn test_proptest_large_bitpacked(data in proptest::collection::vec(num_strategy(), 1..6000)) {
+        create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
+    }
+
+    #[test]
+    fn test_proptest_large_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
+        create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
+    }
+
+    #[test]
+    fn test_proptest_large_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
+        create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
+    }
+}
+
+fn num_strategy() -> impl Strategy<Value = u64> {
+    prop_oneof![
+        1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
+        1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
+        20 => prop::num::u64::ANY,
+    ]
+}
+
+pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
+    let mut data_and_names = vec![];
+
+    let data = (10..=10_000_u64).collect::<Vec<_>>();
+    data_and_names.push((data, "simple monotonically increasing"));
+
+    data_and_names.push((
+        vec![5, 6, 7, 8, 9, 10, 99, 100],
+        "offset in linear interpol",
+    ));
+    data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
+    data_and_names.push((vec![10], "single value"));
+
+    data_and_names.push((
+        vec![1572656989877777, 1170935903116329, 720575940379279, 0],
+        "overflow error",
+    ));
+
+    data_and_names
+}
+
+fn test_codec<C: ColumnCodec>() {
+    let codec_name = std::any::type_name::<C>();
+    for (data, dataset_name) in get_codec_test_datasets() {
+        let estimate_actual_opt: Option<(f32, f32)> =
+            tests::create_and_validate::<C>(&data, dataset_name);
+        let result = if let Some((estimate, actual)) = estimate_actual_opt {
+            format!("Estimate `{estimate}` Actual `{actual}`")
+        } else {
+            "Disabled".to_string()
+        };
+        println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
+    }
+}
+#[test]
+fn test_codec_bitpacking() {
+    test_codec::<BitpackedCodec>();
+}
+#[test]
+fn test_codec_interpolation() {
+    test_codec::<LinearCodec>();
+}
+#[test]
+fn test_codec_multi_interpolation() {
+    test_codec::<BlockwiseLinearCodec>();
+}
+
+use super::*;
+
+fn estimate<C: ColumnCodec>(vals: &[u64]) -> Option<f32> {
+    let mut stats_collector = StatsCollector::default();
+    let mut estimator = C::Estimator::default();
+    for &val in vals {
+        stats_collector.collect(val);
+        estimator.collect(val);
+    }
+    estimator.finalize();
+    let stats = stats_collector.stats();
+    let num_bytes = estimator.estimate(&stats)?;
+    if stats.num_rows == 0 {
+        return None;
+    }
+    Some(num_bytes as f32 / (8.0 * stats.num_rows as f32))
+}
+
+#[test]
+fn estimation_good_interpolation_case() {
+    let data = (10..=20000_u64).collect::<Vec<_>>();
+
+    let linear_interpol_estimation = estimate::<LinearCodec>(&data).unwrap();
+    assert_le!(linear_interpol_estimation, 0.01);
+
+    let multi_linear_interpol_estimation = estimate::<BlockwiseLinearCodec>(&data).unwrap();
+    assert_le!(multi_linear_interpol_estimation, 0.2);
+    assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);
+
+    let bitpacked_estimation = estimate::<BitpackedCodec>(&data).unwrap();
+    assert_lt!(linear_interpol_estimation, bitpacked_estimation);
+}
+
+#[test]
+fn estimation_test_bad_interpolation_case_monotonically_increasing() {
+    let mut data: Vec<u64> = (201..=20000_u64).collect();
+    data.push(1_000_000);
+
+    // In this case, linear interpolation cannot actually be worse than bitpacking,
+    // but the estimator adds some threshold, which leads to a worse estimate.
+    let linear_interpol_estimation = estimate::<LinearCodec>(&data[..]).unwrap();
+    assert_le!(linear_interpol_estimation, 0.35);
+
+    let bitpacked_estimation = estimate::<BitpackedCodec>(&data).unwrap();
+    assert_le!(bitpacked_estimation, 0.32);
+    assert_le!(bitpacked_estimation, linear_interpol_estimation);
+}
+
+#[test]
+fn test_fast_field_codec_type_to_code() {
+    let mut count_codec = 0;
+    for code in 0..=255 {
+        if let Some(codec_type) = CodecType::try_from_code(code) {
+            assert_eq!(codec_type.to_code(), code);
+            count_codec += 1;
+        }
+    }
+    assert_eq!(count_codec, 3);
+}
+
+fn test_fastfield_gcd_i64_with_codec(codec_type: CodecType, num_vals: usize) -> io::Result<()> {
+    let mut vals: Vec<i64> = (-4..=(num_vals as i64) - 5).map(|val| val * 1000).collect();
+    let mut buffer: Vec<u8> = Vec::new();
+    crate::column_values::serialize_u64_based_column_values(
+        &&vals[..],
+        &[codec_type],
+        &mut buffer,
+    )?;
+    let buffer = OwnedBytes::new(buffer);
+    let column = crate::column_values::load_u64_based_column_values::<i64>(buffer.clone())?;
+    assert_eq!(column.get_val(0), -4000i64);
+    assert_eq!(column.get_val(1), -3000i64);
+    assert_eq!(column.get_val(2), -2000i64);
+    assert_eq!(column.max_value(), (num_vals as i64 - 5) * 1000);
+    assert_eq!(column.min_value(), -4000i64);
+
+    // Can't apply gcd
+    let mut buffer_without_gcd = Vec::new();
+    vals.pop();
+    vals.push(1001i64);
+    crate::column_values::serialize_u64_based_column_values(
+        &&vals[..],
+        &[codec_type],
+        &mut buffer_without_gcd,
+    )?;
+    let buffer_without_gcd = OwnedBytes::new(buffer_without_gcd);
+    assert!(buffer_without_gcd.len() > buffer.len());
+
+    Ok(())
+}
+
+#[test]
+fn test_fastfield_gcd_i64() -> io::Result<()> {
+    for &codec_type in &[
+        CodecType::Bitpacked,
+        CodecType::BlockwiseLinear,
+        CodecType::Linear,
+    ] {
+        test_fastfield_gcd_i64_with_codec(codec_type, 5500)?;
+    }
+    Ok(())
+}
+
+fn test_fastfield_gcd_u64_with_codec(codec_type: CodecType, num_vals: usize) -> io::Result<()> {
+    let mut vals: Vec<u64> = (1..=num_vals).map(|i| i as u64 * 1000u64).collect();
+    let mut buffer: Vec<u8> = Vec::new();
+    crate::column_values::serialize_u64_based_column_values(
+        &&vals[..],
+        &[codec_type],
+        &mut buffer,
+    )?;
+    let buffer = OwnedBytes::new(buffer);
+    let column = crate::column_values::load_u64_based_column_values::<u64>(buffer.clone())?;
+    assert_eq!(column.get_val(0), 1000u64);
+    assert_eq!(column.get_val(1), 2000u64);
+    assert_eq!(column.get_val(2), 3000u64);
+    assert_eq!(column.max_value(), num_vals as u64 * 1000);
+    assert_eq!(column.min_value(), 1000u64);
+
+    // Can't apply gcd
+    let mut buffer_without_gcd = Vec::new();
+    vals.pop();
+    vals.push(1001u64);
+    crate::column_values::serialize_u64_based_column_values(
+        &&vals[..],
+        &[codec_type],
+        &mut buffer_without_gcd,
+    )?;
+    let buffer_without_gcd = OwnedBytes::new(buffer_without_gcd);
+    assert!(buffer_without_gcd.len() > buffer.len());
+    Ok(())
+}
+
+#[test]
+fn test_fastfield_gcd_u64() -> io::Result<()> {
+    for &codec_type in &[
+        CodecType::Bitpacked,
+        CodecType::BlockwiseLinear,
+        CodecType::Linear,
+    ] {
+        test_fastfield_gcd_u64_with_codec(codec_type, 5500)?;
+    }
+    Ok(())
+}
+
+#[test]
+pub fn test_fastfield2() {
+    let test_fastfield = crate::column_values::serialize_and_load_u64_based_column_values::<u64>(
+        &&[100u64, 200u64, 300u64][..],
+        &ALL_U64_CODEC_TYPES,
+    );
+    assert_eq!(test_fastfield.get_val(0), 100);
+    assert_eq!(test_fastfield.get_val(1), 200);
+    assert_eq!(test_fastfield.get_val(2), 300);
+}
--- a/columnar/src/column_values/vec_column.rs
+++ b/columnar/src/column_values/vec_column.rs
@@ -0,0 +1,52 @@
+use std::fmt::Debug;
+
+use tantivy_bitpacker::minmax;
+
+use crate::ColumnValues;
+
+/// `VecColumn` provides `ColumnValues` over a slice.
+pub struct VecColumn<'a, T = u64> {
+    pub(crate) values: &'a [T],
+    pub(crate) min_value: T,
+    pub(crate) max_value: T,
+}
+
+impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> ColumnValues<T> for VecColumn<'a, T> {
+    fn get_val(&self, position: u32) -> T {
+        self.values[position as usize]
+    }
+
+    fn iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
+        Box::new(self.values.iter().copied())
+    }
+
+    fn min_value(&self) -> T {
+        self.min_value
+    }
+
+    fn max_value(&self) -> T {
+        self.max_value
+    }
+
+    fn num_vals(&self) -> u32 {
+        self.values.len() as u32
+    }
+
+    fn get_range(&self, start: u64, output: &mut [T]) {
+        output.copy_from_slice(&self.values[start as usize..][..output.len()])
+    }
+}
+
+impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
+where V: AsRef<[T]> + ?Sized
+{
+    fn from(values: &'a V) -> Self {
+        let values = values.as_ref();
+        let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
+        Self {
+            values,
+            min_value,
+            max_value,
+        }
+    }
+}
--- a/columnar/src/columnar/column_type.rs
+++ b/columnar/src/columnar/column_type.rs
@@ -1,11 +1,15 @@
+use std::fmt;
+use std::fmt::Debug;
 use std::net::Ipv6Addr;

+use serde::{Deserialize, Serialize};
+
 use crate::value::NumericalType;
 use crate::InvalidData;

 /// The column type represents the column type.
 /// Any changes need to be propagated to `COLUMN_TYPES`.
-#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy, Ord, PartialOrd)]
+#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy, Ord, PartialOrd, Serialize, Deserialize)]
 #[repr(u8)]
 pub enum ColumnType {
    I64 = 0u8,
@@ -18,6 +22,22 @@ pub enum ColumnType {
    DateTime = 7u8,
 }

+impl fmt::Display for ColumnType {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let short_str = match self {
+            ColumnType::I64 => "i64",
+            ColumnType::U64 => "u64",
+            ColumnType::F64 => "f64",
+            ColumnType::Bytes => "bytes",
+            ColumnType::Str => "str",
+            ColumnType::Bool => "bool",
+            ColumnType::IpAddr => "ip",
+            ColumnType::DateTime => "datetime",
+        };
+        write!(f, "{short_str}")
+    }
+}
+
 // The order needs to match _exactly_ the order in the enum
 const COLUMN_TYPES: [ColumnType; 8] = [
    ColumnType::I64,
@@ -34,6 +54,9 @@ impl ColumnType {
    pub fn to_code(self) -> u8 {
        self as u8
    }
+    pub fn is_date_time(&self) -> bool {
+        self == &ColumnType::DateTime
+    }

    pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
        COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)
@@ -66,7 +89,7 @@ impl ColumnType {
 }

 // TODO remove if possible
-pub trait HasAssociatedColumnType: 'static + Send + Sync + Copy + PartialOrd {
+pub trait HasAssociatedColumnType: 'static + Debug + Send + Sync + Copy + PartialOrd {
    fn column_type() -> ColumnType;
    fn default_value() -> Self;
 }
@@ -110,7 +133,7 @@ impl HasAssociatedColumnType for bool {
    }
 }

-impl HasAssociatedColumnType for crate::DateTime {
+impl HasAssociatedColumnType for common::DateTime {
    fn column_type() -> ColumnType {
        ColumnType::DateTime
    }
@@ -142,7 +165,7 @@ mod tests {
            }
        }
        for code in COLUMN_TYPES.len() as u8..=u8::MAX {
-            assert!(ColumnType::try_from_code(code as u8).is_err());
+            assert!(ColumnType::try_from_code(code).is_err());
        }
    }

--- a/columnar/src/columnar/format_version.rs
+++ b/columnar/src/columnar/format_version.rs
@@ -4,7 +4,7 @@ pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_o

 /// We end the file by these 4 bytes just to somewhat identify that
 /// this is indeed a columnar file.
-const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 066];
+const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];

 pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
    let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
@@ -27,8 +27,8 @@ pub enum Version {
 }

 impl Version {
-    fn to_bytes(&self) -> [u8; 4] {
-        (*self as u32).to_le_bytes()
+    fn to_bytes(self) -> [u8; 4] {
+        (self as u32).to_le_bytes()
    }

    fn try_from_bytes(bytes: [u8; 4]) -> Result<Version, InvalidData> {
--- a/columnar/src/columnar/merge.rs
+++ b/columnar/src/columnar/merge.rs
@@ -1,208 +0,0 @@
-use std::collections::HashMap;
-use std::io;
-
-use crate::columnar::ColumnarReader;
-use crate::dynamic_column::DynamicColumn;
-use crate::ColumnType;
-
-pub enum MergeDocOrder {
-    /// Columnar tables are simply stacked one above the other.
-    /// If the i-th columnar_readers has n_rows_i rows, then
-    /// in the resulting columnar,
-    /// rows [r0..n_row_0) contains the row of columnar_readers[0], in ordder
-    /// rows [n_row_0..n_row_0 + n_row_1 contains the row of columnar_readers[1], in order.
-    /// ..
-    Stack,
-    /// Some more complex mapping, that can interleaves rows from the different readers and
-    /// possibly drop rows.
-    Complex(()),
-}
-
-pub fn merge_columnar(
-    _columnar_readers: &[ColumnarReader],
-    mapping: MergeDocOrder,
-    _output: &mut impl io::Write,
-) -> io::Result<()> {
-    match mapping {
-        MergeDocOrder::Stack => {
-            // implement me :)
-            todo!();
-        }
-        MergeDocOrder::Complex(_) => {
-            // for later
-            todo!();
-        }
-    }
-}
-
-/// Column types are grouped into different categories.
-/// After merge, all columns belonging to the same category are coerced to
-/// the same column type.
-///
-/// In practise, today, only Numerical colummns are coerced into one type today.
-///
-/// See also [README.md].
-#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
-#[repr(u8)]
-enum ColumnTypeCategory {
-    Bool,
-    Str,
-    Numerical,
-    DateTime,
-    Bytes,
-    IpAddr,
-}
-
-impl From<ColumnType> for ColumnTypeCategory {
-    fn from(column_type: ColumnType) -> Self {
-        match column_type {
-            ColumnType::I64 => ColumnTypeCategory::Numerical,
-            ColumnType::U64 => ColumnTypeCategory::Numerical,
-            ColumnType::F64 => ColumnTypeCategory::Numerical,
-            ColumnType::Bytes => ColumnTypeCategory::Bytes,
-            ColumnType::Str => ColumnTypeCategory::Str,
-            ColumnType::Bool => ColumnTypeCategory::Bool,
-            ColumnType::IpAddr => ColumnTypeCategory::IpAddr,
-            ColumnType::DateTime => ColumnTypeCategory::DateTime,
-        }
-    }
-}
-
-fn collect_columns(
-    columnar_readers: &[&ColumnarReader],
-) -> io::Result<HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>>> {
-    // Each column name may have multiple types of column associated.
-    // For merging we are interested in the same column type category since they can be merged.
-    let mut field_name_to_group: HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>> =
-        HashMap::new();
-
-    for columnar_reader in columnar_readers {
-        let column_name_and_handle = columnar_reader.list_columns()?;
-        for (column_name, handle) in column_name_and_handle {
-            let column_type_to_handles = field_name_to_group
-                .entry(column_name.to_string())
-                .or_default();
-
-            let columns = column_type_to_handles
-                .entry(handle.column_type().into())
-                .or_default();
-            columns.push(handle.open()?);
-        }
-    }
-
-    normalize_columns(&mut field_name_to_group);
-
-    Ok(field_name_to_group)
-}
-
-/// Coerce numerical type columns to the same type
-/// TODO rename to `coerce_columns`
-fn normalize_columns(map: &mut HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>>) {
-    for (_field_name, type_category_to_columns) in map.iter_mut() {
-        for (type_category, columns) in type_category_to_columns {
-            if type_category == &ColumnTypeCategory::Numerical {
-                let casted_columns = cast_to_common_numerical_column(&columns);
-                *columns = casted_columns;
-            }
-        }
-    }
-}
-
-/// Receives a list of columns of numerical types (u64, i64, f64)
-///
-/// Returns a list of `DynamicColumn` which are all of the same numerical type
-fn cast_to_common_numerical_column(columns: &[DynamicColumn]) -> Vec<DynamicColumn> {
-    assert!(columns
-        .iter()
-        .all(|column| column.column_type().numerical_type().is_some()));
-    let coerce_to_i64: Vec<_> = columns
-        .iter()
-        .map(|column| column.clone().coerce_to_i64())
-        .collect();
-
-    if coerce_to_i64.iter().all(|column| column.is_some()) {
-        return coerce_to_i64
-            .into_iter()
-            .map(|column| column.unwrap())
-            .collect();
-    }
-
-    let coerce_to_u64: Vec<_> = columns
-        .iter()
-        .map(|column| column.clone().coerce_to_u64())
-        .collect();
-
-    if coerce_to_u64.iter().all(|column| column.is_some()) {
-        return coerce_to_u64
-            .into_iter()
-            .map(|column| column.unwrap())
-            .collect();
-    }
-
-    columns
-        .iter()
-        .map(|column| {
-            column
-                .clone()
-                .coerce_to_f64()
-                .expect("couldn't cast column to f64")
-        })
-        .collect()
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::ColumnarWriter;
-
-    #[test]
-    fn test_column_coercion() {
-        // i64 type
-        let columnar1 = {
-            let mut dataframe_writer = ColumnarWriter::default();
-            dataframe_writer.record_numerical(1u32, "numbers", 1i64);
-            let mut buffer: Vec<u8> = Vec::new();
-            dataframe_writer.serialize(2, &mut buffer).unwrap();
-            ColumnarReader::open(buffer).unwrap()
-        };
-        // u64 type
-        let columnar2 = {
-            let mut dataframe_writer = ColumnarWriter::default();
-            dataframe_writer.record_numerical(1u32, "numbers", u64::MAX - 100);
-            let mut buffer: Vec<u8> = Vec::new();
-            dataframe_writer.serialize(2, &mut buffer).unwrap();
-            ColumnarReader::open(buffer).unwrap()
-        };
-
-        // f64 type
-        let columnar3 = {
-            let mut dataframe_writer = ColumnarWriter::default();
-            dataframe_writer.record_numerical(1u32, "numbers", 30.5);
-            let mut buffer: Vec<u8> = Vec::new();
-            dataframe_writer.serialize(2, &mut buffer).unwrap();
-            ColumnarReader::open(buffer).unwrap()
-        };
-
-        let column_map = collect_columns(&[&columnar1, &columnar2, &columnar3]).unwrap();
-        assert_eq!(column_map.len(), 1);
-        let cat_to_columns = column_map.get("numbers").unwrap();
-        assert_eq!(cat_to_columns.len(), 1);
-
-        let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
-        assert!(numerical.iter().all(|column| column.is_f64()));
-
-        let column_map = collect_columns(&[&columnar1, &columnar1]).unwrap();
-        assert_eq!(column_map.len(), 1);
-        let cat_to_columns = column_map.get("numbers").unwrap();
-        assert_eq!(cat_to_columns.len(), 1);
-        let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
-        assert!(numerical.iter().all(|column| column.is_i64()));
-
-        let column_map = collect_columns(&[&columnar2, &columnar2]).unwrap();
-        assert_eq!(column_map.len(), 1);
-        let cat_to_columns = column_map.get("numbers").unwrap();
-        assert_eq!(cat_to_columns.len(), 1);
-        let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
-        assert!(numerical.iter().all(|column| column.is_u64()));
-    }
-}
--- a/columnar/src/columnar/merge/merge_dict_column.rs
+++ b/columnar/src/columnar/merge/merge_dict_column.rs
@@ -0,0 +1,210 @@
+use std::io::{self, Write};
+
+use common::{BitSet, CountingWriter, ReadOnlyBitSet};
+use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable};
+
+use super::term_merger::TermMerger;
+use crate::column::serialize_column_mappable_to_u64;
+use crate::column_index::SerializableColumnIndex;
+use crate::iterable::Iterable;
+use crate::{BytesColumn, MergeRowOrder, ShuffleMergeOrder};
+
+/// Serializes `[Dictionary, Column, dictionary num bytes U32::LE]`, where
+/// Column is `[Column Index, Column Values, column index num bytes U32::LE]`.
+pub fn merge_bytes_or_str_column(
+    column_index: SerializableColumnIndex<'_>,
+    bytes_columns: &[Option<BytesColumn>],
+    merge_row_order: &MergeRowOrder,
+    output: &mut impl Write,
+) -> io::Result<()> {
+    // Serialize the merged dict first; its size is stored below as a u32, hence the check.
+    let mut output = CountingWriter::wrap(output);
+    let term_ord_mapping = serialize_merged_dict(bytes_columns, merge_row_order, &mut output)?;
+    let dictionary_num_bytes = u32::try_from(output.written_bytes())
+        .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "merged dictionary too large"))?;
+    let output = output.finish();
+    let remapped_term_ordinals_values = RemappedTermOrdinalsValues {
+        bytes_columns,
+        term_ord_mapping: &term_ord_mapping,
+        merge_row_order,
+    };
+    serialize_column_mappable_to_u64(column_index, &remapped_term_ordinals_values, output)?;
+    output.write_all(&dictionary_num_bytes.to_le_bytes())?;
+    Ok(())
+}
+
+/// Merged column values: the term ordinals of the input columns, translated through
+/// `term_ord_mapping` and emitted in the order given by `merge_row_order`.
+struct RemappedTermOrdinalsValues<'a> {
+    bytes_columns: &'a [Option<BytesColumn>],
+    term_ord_mapping: &'a TermOrdinalMapping,
+    merge_row_order: &'a MergeRowOrder,
+}
+
+impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
+        match self.merge_row_order {
+            MergeRowOrder::Shuffled(shuffle_order) => self.boxed_iter_shuffled(shuffle_order),
+            MergeRowOrder::Stack(_) => self.boxed_iter_stacked(),
+        }
+    }
+}
+
+impl<'a> RemappedTermOrdinalsValues<'a> {
+    /// Stacked merge: walks the segments in order and remaps each stored term ordinal.
+    fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
+        let remapped_ords = self
+            .bytes_columns
+            .iter()
+            .enumerate()
+            .filter_map(|(seg_ord, column_opt)| Some((seg_ord, column_opt.as_ref()?)))
+            .flat_map(move |(seg_ord, bytes_column)| {
+                // Old -> new term ordinal table of this segment.
+                let new_term_ords = self.term_ord_mapping.get_segment(seg_ord as u32);
+                bytes_column
+                    .ords()
+                    .values
+                    .iter()
+                    .map(move |old_term_ord| new_term_ords[old_term_ord as usize])
+            });
+        Box::new(remapped_ords)
+    }
+
+    /// Shuffled merge: follows the new-to-old row addresses and remaps the term ordinals of
+    /// each referenced row. Segments without a column contribute no value.
+    fn boxed_iter_shuffled<'b>(
+        &'b self,
+        shuffle_merge_order: &'b ShuffleMergeOrder,
+    ) -> Box<dyn Iterator<Item = u64> + 'b> {
+        let remapped_ords = shuffle_merge_order
+            .iter_new_to_old_row_addrs()
+            .flat_map(move |old_addr| {
+                // Old -> new term ordinal table of the segment the row comes from.
+                let new_term_ords = self.term_ord_mapping.get_segment(old_addr.segment_ord);
+                self.bytes_columns[old_addr.segment_ord as usize]
+                    .as_ref()
+                    .into_iter()
+                    .flat_map(move |bytes_column| {
+                        bytes_column
+                            .term_ords(old_addr.row_id)
+                            .map(|old_term_ord: u64| new_term_ords[old_term_ord as usize])
+                    })
+            });
+        Box::new(remapped_ords)
+    }
+}
+
+/// Collects the ordinals of the terms referenced by at least one row of `row_bitset`.
+fn compute_term_bitset(column: &BytesColumn, row_bitset: &ReadOnlyBitSet) -> BitSet {
+    let max_term_ord = column.dictionary().num_terms() as u32;
+    let mut referenced_terms = BitSet::with_max_value(max_term_ord);
+    let rows = row_bitset.iter();
+    for term_ord in rows.flat_map(|row| column.term_ord_column.values_for_doc(row)) {
+        referenced_terms.insert(term_ord as u32);
+    }
+    referenced_terms
+}
+
+/// Tells whether the current term of `term_merger` has to be kept.
+/// It is kept as soon as one of the segments holding it either has no bitset,
+/// or has the ordinal of the term set in its bitset.
+fn is_term_present(bitsets: &[Option<BitSet>], term_merger: &TermMerger) -> bool {
+    term_merger
+        .matching_segments()
+        .any(|(segment_ord, from_term_ord)| {
+            bitsets[segment_ord]
+                .as_ref()
+                .map_or(true, |bitset| bitset.contains(from_term_ord as u32))
+        })
+}
+
+/// Writes the merged term dictionary of `bytes_columns` to `output` and returns, for each
+/// segment, the mapping from its old term ordinals to the merged ones.
+///
+/// With a shuffled merge order, a term is left out when every segment holding it has a
+/// term bitset and none of these bitsets contains the term.
+fn serialize_merged_dict(
+    bytes_columns: &[Option<BytesColumn>],
+    merge_row_order: &MergeRowOrder,
+    output: &mut impl Write,
+) -> io::Result<TermOrdinalMapping> {
+    let mut term_ord_mapping = TermOrdinalMapping::default();
+
+    // One stream per segment, so that stream positions line up with segment ordinals.
+    let mut field_term_streams = Vec::new();
+    for column_opt in bytes_columns {
+        match column_opt {
+            Some(column) => {
+                term_ord_mapping.add_segment(column.dictionary.num_terms());
+                let terms: Streamer<VoidSSTable> = column.dictionary.stream()?;
+                field_term_streams.push(terms);
+            }
+            None => {
+                term_ord_mapping.add_segment(0);
+                field_term_streams.push(Streamer::empty());
+            }
+        }
+    }
+
+    let mut merged_terms = TermMerger::new(field_term_streams);
+    let mut sstable_builder = sstable::VoidSSTable::writer(output);
+
+    // `None` when stacking (every term is kept); otherwise one optional bitset per segment.
+    let term_bitsets: Option<Vec<Option<BitSet>>> = match merge_row_order {
+        MergeRowOrder::Stack(_) => None,
+        MergeRowOrder::Shuffled(shuffle_merge_order) => {
+            assert_eq!(shuffle_merge_order.alive_bitsets.len(), bytes_columns.len());
+            let mut bitsets = Vec::with_capacity(bytes_columns.len());
+            let segments = shuffle_merge_order.alive_bitsets.iter().zip(bytes_columns);
+            for (alive_bitset_opt, bytes_column_opt) in segments {
+                let term_bitset = match (alive_bitset_opt, bytes_column_opt) {
+                    (Some(alive_bitset), Some(bytes_column)) => {
+                        Some(compute_term_bitset(bytes_column, alive_bitset))
+                    }
+                    _ => None,
+                };
+                bitsets.push(term_bitset);
+            }
+            Some(bitsets)
+        }
+    };
+
+    // Merged ordinals are handed out in the order terms are inserted in the new dictionary.
+    let mut current_term_ord = 0;
+    while merged_terms.advance() {
+        if let Some(bitsets) = term_bitsets.as_deref() {
+            if !is_term_present(bitsets, &merged_terms) {
+                // The term is not kept: it gets no ordinal in the merged dictionary.
+                continue;
+            }
+        }
+        let term_bytes: &[u8] = merged_terms.key();
+        sstable_builder.insert(term_bytes, &())?;
+        for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
+            term_ord_mapping.register_from_to(segment_ord, from_term_ord, current_term_ord);
+        }
+        current_term_ord += 1;
+    }
+    sstable_builder.finish()?;
+    Ok(term_ord_mapping)
+}
+
+#[derive(Default, Debug)]
+struct TermOrdinalMapping {
+    per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>, // [segment_ord][old ord] -> new ord
+}
+
+impl TermOrdinalMapping {
+    fn add_segment(&mut self, max_term_ord: usize) {
+        self.per_segment_new_term_ordinals
+            .push(vec![TermOrdinal::default(); max_term_ord]); // one slot per old term ordinal
+    }
+
+    fn register_from_to(&mut self, segment_ord: usize, from_ord: TermOrdinal, to_ord: TermOrdinal) {
+        self.per_segment_new_term_ordinals[segment_ord][from_ord as usize] = to_ord;
+    }
+
+    fn get_segment(&self, segment_ord: u32) -> &[TermOrdinal] {
+        &(self.per_segment_new_term_ordinals[segment_ord as usize])[..] // indexed by old ord
+    }
+}
--- a/columnar/src/columnar/merge/merge_mapping.rs
+++ b/columnar/src/columnar/merge/merge_mapping.rs
@@ -0,0 +1,129 @@
+use std::ops::Range;
+
+use common::{BitSet, OwnedBytes, ReadOnlyBitSet};
+
+use crate::{ColumnarReader, RowAddr, RowId};
+
+/// Row order obtained by stacking the columnars one after the other.
+pub struct StackMergeOrder {
+    // This does not start at 0. The first entry is the number of
+    // rows in the first columnar.
+    cumulated_row_ids: Vec<RowId>,
+}
+
+impl StackMergeOrder {
+    #[cfg(test)]
+    pub fn stack_for_test(num_rows_per_columnar: &[u32]) -> StackMergeOrder {
+        Self::from_num_rows(num_rows_per_columnar.iter().copied())
+    }
+
+    pub fn stack(columnars: &[&ColumnarReader]) -> StackMergeOrder {
+        Self::from_num_rows(columnars.iter().map(|columnar| columnar.num_rows()))
+    }
+
+    /// Builds the order from the number of rows of each columnar, as a running sum.
+    fn from_num_rows(num_rows_per_columnar: impl Iterator<Item = RowId>) -> StackMergeOrder {
+        let cumulated_row_ids: Vec<RowId> = num_rows_per_columnar
+            .scan(0, |cumulated_row_id: &mut RowId, num_rows| {
+                *cumulated_row_id += num_rows;
+                Some(*cumulated_row_id)
+            })
+            .collect();
+        StackMergeOrder { cumulated_row_ids }
+    }
+
+    pub fn num_rows(&self) -> RowId {
+        self.cumulated_row_ids.last().copied().unwrap_or(0)
+    }
+
+    pub fn offset(&self, columnar_id: usize) -> RowId {
+        match columnar_id {
+            0 => 0,
+            _ => self.cumulated_row_ids[columnar_id - 1],
+        }
+    }
+
+    pub fn columnar_range(&self, columnar_id: usize) -> Range<RowId> {
+        self.offset(columnar_id)..self.offset(columnar_id + 1)
+    }
+}
+
+pub enum MergeRowOrder {
+    /// Columnar tables are simply stacked one above the other.
+    /// If the i-th columnar_readers has n_rows_i rows, then
+    /// in the resulting columnar,
+    /// rows [0..n_row_0) contain the rows of `columnar_readers[0]`, in order,
+    /// rows [n_row_0..n_row_0 + n_row_1) contain the rows of `columnar_readers[1]`, in order.
+    /// ..
+    /// No document is deleted.
+    Stack(StackMergeOrder),
+    /// Some more complex mapping, that may interleave rows from the different readers,
+    /// drop rows, or do both.
+    Shuffled(ShuffleMergeOrder),
+}
+
+impl From<StackMergeOrder> for MergeRowOrder {
+    fn from(stack_order: StackMergeOrder) -> Self {
+        Self::Stack(stack_order)
+    }
+}
+
+impl From<ShuffleMergeOrder> for MergeRowOrder {
+    fn from(shuffle_order: ShuffleMergeOrder) -> Self {
+        Self::Shuffled(shuffle_order)
+    }
+}
+
+impl MergeRowOrder {
+    pub fn num_rows(&self) -> RowId {
+        match self {
+            Self::Shuffled(shuffle_merge_order) => shuffle_merge_order.num_rows(),
+            Self::Stack(stack_merge_order) => stack_merge_order.num_rows(),
+        }
+    }
+}
+
+/// Merge order in which rows can be reordered and dropped.
+pub struct ShuffleMergeOrder {
+    /// For each row of the merged columnar, in order, the address of the original row.
+    pub new_row_id_to_old_row_id: Vec<RowAddr>,
+    /// One optional bitset of alive rows per input columnar.
+    pub alive_bitsets: Vec<Option<ReadOnlyBitSet>>,
+}
+
+impl ShuffleMergeOrder {
+    /// Builds a merge order whose alive bitsets contain exactly the rows referenced by
+    /// `new_row_id_to_old_row_id`.
+    pub fn for_test(
+        segment_num_rows: &[RowId],
+        new_row_id_to_old_row_id: Vec<RowAddr>,
+    ) -> ShuffleMergeOrder {
+        let mut alive_bitsets: Vec<BitSet> = segment_num_rows
+            .iter()
+            .map(|&num_rows| BitSet::with_max_value(num_rows))
+            .collect();
+        for old_row_addr in &new_row_id_to_old_row_id {
+            alive_bitsets[old_row_addr.segment_ord as usize].insert(old_row_addr.row_id);
+        }
+        let alive_bitsets: Vec<Option<ReadOnlyBitSet>> = alive_bitsets
+            .into_iter()
+            .map(|alive_bitset| {
+                let mut buffer = Vec::new();
+                alive_bitset.serialize(&mut buffer).unwrap();
+                Some(ReadOnlyBitSet::open(OwnedBytes::new(buffer)))
+            })
+            .collect();
+        ShuffleMergeOrder {
+            new_row_id_to_old_row_id,
+            alive_bitsets,
+        }
+    }
+
+    pub fn num_rows(&self) -> RowId {
+        self.new_row_id_to_old_row_id.len() as RowId
+    }
+
+    pub fn iter_new_to_old_row_addrs(&self) -> impl Iterator<Item = RowAddr> + '_ {
+        self.new_row_id_to_old_row_id.iter().copied()
+    }
+}
--- a/columnar/src/columnar/merge/mod.rs
+++ b/columnar/src/columnar/merge/mod.rs
@@ -0,0 +1,486 @@
+mod merge_dict_column;
+mod merge_mapping;
+mod term_merger;
+
+use std::collections::{BTreeMap, HashSet};
+use std::io;
+use std::net::Ipv6Addr;
+use std::sync::Arc;
+
+use itertools::Itertools;
+pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
+
+use super::writer::ColumnarSerializer;
+use crate::column::{serialize_column_mappable_to_u128, serialize_column_mappable_to_u64};
+use crate::column_values::MergedColumnValues;
+use crate::columnar::merge::merge_dict_column::merge_bytes_or_str_column;
+use crate::columnar::writer::CompatibleNumericalTypes;
+use crate::columnar::ColumnarReader;
+use crate::dynamic_column::DynamicColumn;
+use crate::{
+    BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType,
+    NumericalValue,
+};
+
+/// Column types are grouped into different categories.
+/// After merge, all columns belonging to the same category are coerced to
+/// the same column type.
+///
+/// In practice, today, only Numerical columns are coerced into one type.
+///
+/// See also [README.md].
+///
+/// The ordering has to match the ordering of the variants in [ColumnType].
+#[derive(Copy, Clone, Eq, PartialOrd, Ord, PartialEq, Hash, Debug)]
+pub(crate) enum ColumnTypeCategory {
+    Numerical,
+    Bytes,
+    Str,
+    Bool,
+    IpAddr,
+    DateTime,
+}
+
+impl From<ColumnType> for ColumnTypeCategory {
+    /// Maps a column type to its category.
+    /// The three numerical types all belong to the `Numerical` category.
+    fn from(column_type: ColumnType) -> Self {
+        match column_type {
+            ColumnType::I64 | ColumnType::U64 | ColumnType::F64 => Self::Numerical,
+            ColumnType::Bytes => Self::Bytes,
+            ColumnType::Str => Self::Str,
+            ColumnType::Bool => Self::Bool,
+            ColumnType::IpAddr => Self::IpAddr,
+            ColumnType::DateTime => Self::DateTime,
+        }
+    }
+}
+
+/// Merges several columnar tables together.
+/// The resulting columnar is serialized into `output`.
+///
+/// If several columns with the same name are conflicting with the numerical types in the
+/// input columnars, the first type compatible out of i64, u64, f64 in that order will be used.
+///
+/// `required_columns` makes it possible to ensure that some columns will be present in the
+/// resulting columnar. When a required column is a numerical column type, one of two things can
+/// happen:
+/// - If the required column type is compatible with all of the input columnars, the
+///   merge will simply coerce the input columns and use the required column type.
+/// - If the required column type is incompatible with one of the input columnars, the
+///   merge will fail with an `InvalidInput` error.
+///
+/// `merge_row_order` makes it possible to remove or reorder rows in the resulting
+/// `Columnar` table.
+///
+/// Reminder: a string and a numerical column may bear the same column name. This is not
+/// considered a conflict.
+pub fn merge_columnar(
+    columnar_readers: &[&ColumnarReader],
+    required_columns: &[(String, ColumnType)],
+    merge_row_order: MergeRowOrder,
+    output: &mut impl io::Write,
+) -> io::Result<()> {
+    let mut serializer = ColumnarSerializer::new(output);
+    let num_rows_per_columnar = columnar_readers
+        .iter()
+        .map(|reader| reader.num_rows())
+        .collect::<Vec<u32>>();
+
+    let columns_to_merge =
+        group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
+    for res in columns_to_merge {
+        let ((column_name, _column_type_category), grouped_columns) = res;
+        let grouped_columns = grouped_columns.open(&merge_row_order)?;
+        if grouped_columns.is_empty() {
+            continue;
+        }
+
+        let column_type = grouped_columns.column_type_after_merge();
+        let mut columns = grouped_columns.columns;
+        coerce_columns(column_type, &mut columns)?;
+
+        let mut column_serializer =
+            serializer.start_serialize_column(column_name.as_bytes(), column_type);
+        merge_column(
+            column_type,
+            &num_rows_per_columnar,
+            columns,
+            &merge_row_order,
+            &mut column_serializer,
+        )?;
+        column_serializer.finalize()?;
+    }
+
+    serializer.finalize(merge_row_order.num_rows())?;
+    Ok(())
+}
+
+fn dynamic_column_to_u64_monotonic(dynamic_column: DynamicColumn) -> Option<Column<u64>> {
+    Some(match dynamic_column {
+        DynamicColumn::IpAddr(_) | DynamicColumn::Bytes(_) | DynamicColumn::Str(_) => return None,
+        DynamicColumn::Bool(column) => column.to_u64_monotonic(),
+        DynamicColumn::I64(column) => column.to_u64_monotonic(),
+        DynamicColumn::U64(column) => column.to_u64_monotonic(),
+        DynamicColumn::F64(column) => column.to_u64_monotonic(),
+        DynamicColumn::DateTime(column) => column.to_u64_monotonic(),
+    })
+}
+
+/// Merges the columns of one group and writes the resulting column to `wrt`.
+///
+/// `columns` holds one optional column per columnar. A missing (or unexpected) column is
+/// replaced by an empty column index sized with the matching entry of `num_docs_per_column`.
+fn merge_column(
+    column_type: ColumnType,
+    num_docs_per_column: &[u32],
+    columns: Vec<Option<DynamicColumn>>,
+    merge_row_order: &MergeRowOrder,
+    wrt: &mut impl io::Write,
+) -> io::Result<()> {
+    let num_columns = columns.len();
+    let empty_index = |columnar_id: usize| ColumnIndex::Empty {
+        num_docs: num_docs_per_column[columnar_id],
+    };
+    let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(num_columns);
+    match column_type {
+        ColumnType::I64
+        | ColumnType::U64
+        | ColumnType::F64
+        | ColumnType::DateTime
+        | ColumnType::Bool => {
+            let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> =
+                Vec::with_capacity(num_columns);
+            for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
+                match dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic) {
+                    Some(Column { index, values }) => {
+                        column_indexes.push(index);
+                        column_values.push(Some(values));
+                    }
+                    None => {
+                        column_indexes.push(empty_index(i));
+                        column_values.push(None);
+                    }
+                }
+            }
+            let merged_column_index =
+                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let merge_column_values = MergedColumnValues {
+                column_indexes: &column_indexes[..],
+                column_values: &column_values[..],
+                merge_row_order,
+            };
+            serialize_column_mappable_to_u64(merged_column_index, &merge_column_values, wrt)?;
+        }
+        ColumnType::IpAddr => {
+            let mut column_values: Vec<Option<Arc<dyn ColumnValues<Ipv6Addr>>>> =
+                Vec::with_capacity(num_columns);
+            for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
+                match dynamic_column_opt {
+                    Some(DynamicColumn::IpAddr(Column { index, values })) => {
+                        column_indexes.push(index);
+                        column_values.push(Some(values));
+                    }
+                    _ => {
+                        column_indexes.push(empty_index(i));
+                        column_values.push(None);
+                    }
+                }
+            }
+            let merged_column_index =
+                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            let merge_column_values = MergedColumnValues {
+                column_indexes: &column_indexes[..],
+                column_values: &column_values,
+                merge_row_order,
+            };
+            serialize_column_mappable_to_u128(merged_column_index, &merge_column_values, wrt)?;
+        }
+        ColumnType::Bytes | ColumnType::Str => {
+            let mut bytes_columns: Vec<Option<BytesColumn>> = Vec::with_capacity(num_columns);
+            for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
+                match dynamic_column_opt {
+                    Some(DynamicColumn::Str(str_column)) => {
+                        column_indexes.push(str_column.term_ord_column.index.clone());
+                        bytes_columns.push(Some(str_column.into()));
+                    }
+                    Some(DynamicColumn::Bytes(bytes_column)) => {
+                        column_indexes.push(bytes_column.term_ord_column.index.clone());
+                        bytes_columns.push(Some(bytes_column));
+                    }
+                    _ => {
+                        column_indexes.push(empty_index(i));
+                        bytes_columns.push(None);
+                    }
+                }
+            }
+            let merged_column_index =
+                crate::column_index::merge_column_index(&column_indexes[..], merge_row_order);
+            merge_bytes_or_str_column(merged_column_index, &bytes_columns, merge_row_order, wrt)?;
+        }
+    }
+    Ok(())
+}
+
+/// Opened columns sharing the same name and type category.
+struct GroupedColumns {
+    /// Column type imposed through the required columns, if any.
+    required_column_type: Option<ColumnType>,
+    /// One entry per input columnar; `None` if it brings no column to the group.
+    columns: Vec<Option<DynamicColumn>>,
+}
+
+impl GroupedColumns {
+    /// Checks whether the column group can be skipped during serialization.
+    fn is_empty(&self) -> bool {
+        let has_column = self.columns.iter().any(Option::is_some);
+        self.required_column_type.is_none() && !has_column
+    }
+
+    /// Returns the column type after merge.
+    /// A required column type, when set, always takes precedence.
+    ///
+    /// This method does not check if the column types can actually be coerced to
+    /// this type.
+    fn column_type_after_merge(&self) -> ColumnType {
+        if let Some(required_type) = self.required_column_type {
+            return required_type;
+        }
+        let present_columns = || self.columns.iter().flatten();
+        let column_types: HashSet<ColumnType> = present_columns()
+            .map(|column| column.column_type())
+            .collect();
+        if column_types.len() == 1 {
+            return column_types.into_iter().next().unwrap();
+        }
+        // At the moment, only the numerical categorical column type has more than one possible
+        // column type.
+        assert!(present_columns()
+            .all(|el| ColumnTypeCategory::from(el.column_type()) == ColumnTypeCategory::Numerical));
+        merged_numerical_columns_type(present_columns()).into()
+    }
+}
+
+/// Not yet opened counterpart of `GroupedColumns`: one optional column handle per columnar.
+struct GroupedColumnsHandle {
+    required_column_type: Option<ColumnType>,
+    columns: Vec<Option<DynamicColumnHandle>>,
+}
+
+impl GroupedColumnsHandle {
+    fn new(num_columnars: usize) -> Self {
+        GroupedColumnsHandle {
+            required_column_type: None,
+            columns: vec![None; num_columnars],
+        }
+    }
+    /// Opens every handle. Columns that are empty after the merge are replaced by `None`.
+    fn open(self, merge_row_order: &MergeRowOrder) -> io::Result<GroupedColumns> {
+        let mut columns: Vec<Option<DynamicColumn>> = Vec::new();
+        for (columnar_id, handle_opt) in self.columns.iter().enumerate() {
+            let column_opt = match handle_opt {
+                Some(handle) => {
+                    let column = handle.open()?;
+                    // We skip columns that end up with 0 documents. That way, we make sure
+                    // they don't end up influencing the merge type or creating empty columns.
+                    if is_empty_after_merge(merge_row_order, &column, columnar_id) {
+                        None
+                    } else {
+                        Some(column)
+                    }
+                }
+                None => None,
+            };
+            columns.push(column_opt);
+        }
+        Ok(GroupedColumns {
+            required_column_type: self.required_column_type,
+            columns,
+        })
+    }
+
+    /// Set the dynamic column for a given columnar.
+    fn set_column(&mut self, columnar_id: usize, column: DynamicColumnHandle) {
+        self.columns[columnar_id] = Some(column);
+    }
+
+    /// Force the existence of a column, as well as its type.
+    fn require_type(&mut self, required_type: ColumnType) -> io::Result<()> {
+        match self.required_column_type {
+            // This was just a duplicate in the `required_columns`: nothing to do.
+            Some(existing_required_type) if existing_required_type == required_type => Ok(()),
+            Some(_) => Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "Required column conflicts with another required column of the same type \
+                 category.",
+            )),
+            None => {
+                self.required_column_type = Some(required_type);
+                Ok(())
+            }
+        }
+    }
+}
+
+/// Returns the type of the merged numerical column.
+///
+/// This function picks the first numerical type out of i64, u64, f64 (order matters
+/// here), that is compatible with all the `columns`.
+///
+/// # Panics
+/// Panics if one of the columns is not numerical.
+fn merged_numerical_columns_type<'a>(
+    columns: impl Iterator<Item = &'a DynamicColumn>,
+) -> NumericalType {
+    let mut compatible_numerical_types = CompatibleNumericalTypes::default();
+    for column in columns {
+        let (min_value, max_value) =
+            min_max_if_numerical(column).expect("All columns are required to be numerical");
+        compatible_numerical_types.accept_value(min_value);
+        compatible_numerical_types.accept_value(max_value);
+    }
+    compatible_numerical_types.to_numerical_type()
+}
+
+/// Returns true if `column` ends up without any value once `merge_row_order` is applied.
+///
+/// This is the case if the column has no value to begin with, or if none of the rows
+/// carrying a value is present in the alive bitset of the columnar `columnar_ord`.
+/// Stacking, or shuffling without an alive bitset, never empties a non-empty column.
+fn is_empty_after_merge(
+    merge_row_order: &MergeRowOrder,
+    column: &DynamicColumn,
+    columnar_ord: usize,
+) -> bool {
+    if column.num_values() == 0u32 {
+        // It was empty before the merge.
+        return true;
+    }
+
+    let shuffled = match merge_row_order {
+        // If we are stacking the columnar, no rows are being deleted.
+        MergeRowOrder::Stack(_) => return false,
+        MergeRowOrder::Shuffled(shuffled) => shuffled,
+    };
+
+    let alive_bitset = match &shuffled.alive_bitsets[columnar_ord] {
+        Some(alive_bitset) => alive_bitset,
+        // No document is being deleted.
+        // The shuffle is applying a permutation.
+        None => return false,
+    };
+
+    // Rows may be dropped: look for a row that carries a value and is still alive.
+    let column_index = column.column_index();
+    match column_index {
+        ColumnIndex::Empty { .. } => true,
+        ColumnIndex::Full => alive_bitset.len() == 0,
+        ColumnIndex::Optional(optional_index) => {
+            // Empty unless one of the rows listed by the index is alive.
+            let mut rows_with_value = optional_index.iter_rows();
+            !rows_with_value.any(|doc| alive_bitset.contains(doc))
+        }
+        ColumnIndex::Multivalued(multivalued_index) => {
+            for (doc_id, (start_index, end_index)) in multivalued_index
+                .start_index_column
+                .iter()
+                .tuple_windows()
+                .enumerate()
+            {
+                // The document contains values and is present in the alive bitset.
+                // The column is therefore not empty.
+                if start_index != end_index && alive_bitset.contains(doc_id as u32) {
+                    return false;
+                }
+            }
+            // No document with values is present in the alive bitset.
+            true
+        }
+    }
+}
+
+
+/// Iterates over the columns of the columnar readers, grouped by column name.
+/// Key functionality is that `open` of the Columns is done lazy per group.
+fn group_columns_for_merge<'a>(
+    columnar_readers: &'a [&'a ColumnarReader],
+    required_columns: &'a [(String, ColumnType)],
+    _merge_row_order: &'a MergeRowOrder,
+) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> {
+    let num_columnars = columnar_readers.len();
+    let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new();
+
+    // Register the required columns: their group exists even if no reader has the column.
+    for (column_name, column_type) in required_columns {
+        columns
+            .entry((column_name.clone(), (*column_type).into()))
+            .or_insert_with(|| GroupedColumnsHandle::new(num_columnars))
+            .require_type(*column_type)?;
+    }
+
+    for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
+        for (column_name, handle) in columnar_reader.iter_columns()? {
+            let column_category: ColumnTypeCategory = handle.column_type().into();
+            columns
+                .entry((column_name, column_category))
+                .or_insert_with(|| GroupedColumnsHandle::new(num_columnars))
+                .set_column(columnar_id, handle);
+        }
+    }
+    Ok(columns)
+}
+
+fn coerce_columns(
+    column_type: ColumnType,
+    columns: &mut [Option<DynamicColumn>], // present columns are coerced in place
+) -> io::Result<()> {
+    for column_opt in columns.iter_mut() {
+        if let Some(column) = column_opt.take() {
+            *column_opt = Some(coerce_column(column_type, column)?); // slot stays `None` on error
+        }
+    }
+    Ok(())
+}
+
+/// Coerces `column` to `column_type`.
+///
+/// Numerical target types go through `coerce_numerical`; for any other target type the
+/// column must already be of that type. Fails with an `InvalidInput` error otherwise.
+fn coerce_column(column_type: ColumnType, column: DynamicColumn) -> io::Result<DynamicColumn> {
+    let source_type = column.column_type();
+    let coerced_column_opt = match column_type.numerical_type() {
+        Some(numerical_type) => column.coerce_numerical(numerical_type),
+        None if source_type == column_type => Some(column),
+        None => None,
+    };
+    coerced_column_opt.ok_or_else(|| {
+        io::Error::new(
+            io::ErrorKind::InvalidInput,
+            format!("Cannot coerce column of type `{source_type:?}` to `{column_type:?}`"),
+        )
+    })
+}
+
+/// Returns the (min, max) of a column provided it is numerical (i64, u64, f64).
+/// Returns `None` for every other column type.
+///
+/// The min and the max are simply the values returned by the `min_value()` and
+/// `max_value()` methods of the column.
+///
+/// It is important to note that these values are only guaranteed to be lower/upper bounds
+/// (as opposed to min/max values). If a column is empty, they are currently set to 0.
+fn min_max_if_numerical(column: &DynamicColumn) -> Option<(NumericalValue, NumericalValue)> {
+    match column {
+        DynamicColumn::I64(column) => Some((column.min_value().into(), column.max_value().into())),
+        DynamicColumn::U64(column) => Some((column.min_value().into(), column.max_value().into())),
+        DynamicColumn::F64(column) => Some((column.min_value().into(), column.max_value().into())),
+        DynamicColumn::Bool(_)
+        | DynamicColumn::IpAddr(_)
+        | DynamicColumn::DateTime(_)
+        | DynamicColumn::Bytes(_)
+        | DynamicColumn::Str(_) => None,
+    }
+}
+
+#[cfg(test)]
+mod tests;
--- a/columnar/src/columnar/merge/term_merger.rs
+++ b/columnar/src/columnar/merge/term_merger.rs
@@ -0,0 +1,107 @@
+use std::cmp::Ordering;
+use std::collections::BinaryHeap;
+
+use sstable::TermOrdinal;
+
+use crate::Streamer;
+
+/// A term stream paired with the ordinal of the segment it comes from.
+///
+/// `TermMerger` keeps these items in a `BinaryHeap` while merging.
+pub struct HeapItem<'a> {
+    /// The term stream being merged.
+    pub streamer: Streamer<'a>,
+    /// Position of `streamer` in the list of streams passed to `TermMerger::new`.
+    pub segment_ord: usize,
+}
+
+/// Equality is derived from `Ord::cmp` so that `a == b` holds exactly when
+/// `a.cmp(&b) == Ordering::Equal`, as required by the `Ord` / `PartialEq` contract.
+///
+/// Comparing `segment_ord` alone would declare two items equal even when their current keys
+/// differ, which disagrees with the ordering used by the heap.
+impl<'a> PartialEq for HeapItem<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+impl<'a> Eq for HeapItem<'a> {}
+
+/// The ordering is total: `partial_cmp` simply forwards to `Ord::cmp`.
+impl<'a> PartialOrd for HeapItem<'a> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+/// Items are ordered by `(key, segment_ord)` with the operands swapped, i.e. in reverse.
+///
+/// `BinaryHeap` is a max-heap, so the reversed comparison makes it pop the smallest key first,
+/// and for equal keys the smallest segment ordinal first.
+impl<'a> Ord for HeapItem<'a> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        other
+            .streamer
+            .key()
+            .cmp(self.streamer.key())
+            .then_with(|| other.segment_ord.cmp(&self.segment_ord))
+    }
+}
+
+/// Given a list of sorted term streams,
+/// iterates over the sorted unique terms.
+///
+/// After each successful call to `advance()`, the merger exposes
+/// - the current term (see `key()`)
+/// - the ordinals of the segments containing that term, along with the
+/// term ordinal within each of them (see `matching_segments()`).
+pub struct TermMerger<'a> {
+    // Streamers that have been advanced but are not positioned on the current term.
+    heap: BinaryHeap<HeapItem<'a>>,
+    // Streamers positioned on the current term (all the streamers before the first `advance()`).
+    current_streamers: Vec<HeapItem<'a>>,
+}
+
+impl<'a> TermMerger<'a> {
+    /// Creates a merger over the given sorted term streams.
+    ///
+    /// The position of a stream within `streams` becomes its segment ordinal.
+    pub fn new(streams: Vec<Streamer<'a>>) -> TermMerger<'a> {
+        let mut current_streamers = Vec::with_capacity(streams.len());
+        for (segment_ord, streamer) in streams.into_iter().enumerate() {
+            current_streamers.push(HeapItem {
+                streamer,
+                segment_ord,
+            });
+        }
+        TermMerger {
+            heap: BinaryHeap::new(),
+            current_streamers,
+        }
+    }
+
+    /// Yields `(segment ordinal, term ordinal)` for every streamer positioned on the current term.
+    pub(crate) fn matching_segments<'b: 'a>(
+        &'b self,
+    ) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
+        self.current_streamers
+            .iter()
+            .map(|item| (item.segment_ord, item.streamer.term_ord()))
+    }
+
+    /// Advances every streamer of the current term and puts those that still
+    /// have a term left back into the heap. Exhausted streamers are dropped.
+    fn advance_segments(&mut self) {
+        for mut item in self.current_streamers.drain(..) {
+            if item.streamer.advance() {
+                self.heap.push(item);
+            }
+        }
+    }
+
+    /// Advance the term iterator to the next term.
+    /// Returns true if there is indeed another term
+    /// False if there is none.
+    pub fn advance(&mut self) -> bool {
+        self.advance_segments();
+        let Some(head) = self.heap.pop() else {
+            return false;
+        };
+        self.current_streamers.push(head);
+        // Gather all the other streamers positioned on the very same term.
+        loop {
+            let same_term = match self.heap.peek() {
+                Some(candidate) => {
+                    candidate.streamer.key() == self.current_streamers[0].streamer.key()
+                }
+                None => false,
+            };
+            if !same_term {
+                break;
+            }
+            let item = self.heap.pop().unwrap(); // safe : we peeked beforehand
+            self.current_streamers.push(item);
+        }
+        true
+    }
+
+    /// Returns the current term.
+    ///
+    /// This method may be called
+    /// if and only if advance() has been called before
+    /// and "true" was returned.
+    pub fn key(&self) -> &[u8] {
+        let head = &self.current_streamers[0];
+        head.streamer.key()
+    }
+}
--- a/columnar/src/columnar/merge/tests.rs
+++ b/columnar/src/columnar/merge/tests.rs
@@ -0,0 +1,496 @@
+use std::collections::BTreeMap;
+
+use itertools::Itertools;
+
+use super::*;
+use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
+
+/// Builds a columnar with a single column `column_name`, holding one numerical value per row
+/// (row `i` receives `vals[i]`), and opens a reader on its serialized form.
+fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
+    column_name: &str,
+    vals: &[T],
+) -> ColumnarReader {
+    let mut writer = ColumnarWriter::default();
+    writer.record_column_type(column_name, T::column_type(), false);
+    for (row_id, &val) in vals.iter().enumerate() {
+        writer.record_numerical(row_id as RowId, column_name, val.into());
+    }
+    let num_rows = vals.len() as RowId;
+    let mut serialized: Vec<u8> = Vec::new();
+    writer.serialize(num_rows, None, &mut serialized).unwrap();
+    ColumnarReader::open(serialized).unwrap()
+}
+
+#[test]
+fn test_column_coercion_to_u64() {
+    // One segment stores the column as i64, the other one as u64.
+    let i64_columnar = make_columnar("numbers", &[1i64]);
+    let u64_columnar = make_columnar("numbers", &[u64::MAX]);
+    let columnars = &[&i64_columnar, &u64_columnar];
+    let merge_order = StackMergeOrder::stack(columnars).into();
+    let grouped: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
+        group_columns_for_merge(columnars, &[], &merge_order).unwrap();
+    // Both columns must end up in one single numerical group.
+    let numbers_key = ("numbers".to_string(), ColumnTypeCategory::Numerical);
+    assert_eq!(grouped.len(), 1);
+    assert!(grouped.contains_key(&numbers_key));
+}
+
+#[test]
+fn test_column_coercion_to_i64() {
+    // A negative i64 value on one side, a small u64 value on the other side.
+    let i64_columnar = make_columnar("numbers", &[-1i64]);
+    let u64_columnar = make_columnar("numbers", &[2u64]);
+    let columnars = &[&i64_columnar, &u64_columnar];
+    let merge_order = StackMergeOrder::stack(columnars).into();
+    let grouped: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
+        group_columns_for_merge(columnars, &[], &merge_order).unwrap();
+    let numbers_key = ("numbers".to_string(), ColumnTypeCategory::Numerical);
+    assert_eq!(grouped.len(), 1);
+    assert!(grouped.contains_key(&numbers_key));
+}
+
+//#[test]
+// fn test_impossible_coercion_returns_an_error() {
+// let columnar1 = make_columnar("numbers", &[u64::MAX]);
+// let merge_order = StackMergeOrder::stack(&[&columnar1]).into();
+// let group_error = group_columns_for_merge_iter(
+//&[&columnar1],
+//&[("numbers".to_string(), ColumnType::I64)],
+//&merge_order,
+//)
+//.unwrap_err();
+// assert_eq!(group_error.kind(), io::ErrorKind::InvalidInput);
+//}
+
+#[test]
+fn test_group_columns_with_required_column() {
+    let i64_columnar = make_columnar("numbers", &[1i64]);
+    let u64_columnar = make_columnar("numbers", &[2u64]);
+    let columnars = &[&i64_columnar, &u64_columnar];
+    let merge_order = StackMergeOrder::stack(columnars).into();
+    // The column is explicitly required as u64.
+    let required_columns = [("numbers".to_string(), ColumnType::U64)];
+    let grouped: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
+        group_columns_for_merge(
+            &[&i64_columnar, &u64_columnar],
+            &required_columns,
+            &merge_order,
+        )
+        .unwrap();
+    let numbers_key = ("numbers".to_string(), ColumnTypeCategory::Numerical);
+    assert_eq!(grouped.len(), 1);
+    assert!(grouped.contains_key(&numbers_key));
+}
+
+#[test]
+fn test_group_columns_required_column_with_no_existing_columns() {
+    let left = make_columnar("numbers", &[2u64]);
+    let right = make_columnar("numbers", &[2u64]);
+    let columnars = &[&left, &right];
+    let merge_order = StackMergeOrder::stack(columnars).into();
+    // None of the columnars holds a column named `required_col`.
+    let required_columns = [("required_col".to_string(), ColumnType::Str)];
+    let grouped: BTreeMap<_, _> =
+        group_columns_for_merge(columnars, &required_columns, &merge_order).unwrap();
+    assert_eq!(grouped.len(), 2);
+    let required_key = ("required_col".to_string(), ColumnTypeCategory::Str);
+    let required_group = grouped.get(&required_key).unwrap();
+    let columns = &required_group.columns;
+    // One (empty) slot per input columnar.
+    assert_eq!(columns.len(), 2);
+    assert!(columns[0].is_none());
+    assert!(columns[1].is_none());
+}
+
+#[test]
+fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_rule() {
+    // Both columnars agree on i64 ...
+    let left = make_columnar("numbers", &[2i64]);
+    let right = make_columnar("numbers", &[2i64]);
+    let columnars = &[&left, &right];
+    let merge_order = StackMergeOrder::stack(columnars).into();
+    // ... yet the column is required as u64.
+    let required_columns = [("numbers".to_string(), ColumnType::U64)];
+    let grouped: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
+        group_columns_for_merge(columnars, &required_columns, &merge_order).unwrap();
+    let numbers_key = ("numbers".to_string(), ColumnTypeCategory::Numerical);
+    assert_eq!(grouped.len(), 1);
+    assert!(grouped.contains_key(&numbers_key));
+}
+
+#[test]
+fn test_missing_column() {
+    // Each columnar holds a column the other one lacks.
+    let only_numbers = make_columnar("numbers", &[-1i64]);
+    let only_numbers2 = make_columnar("numbers2", &[2u64]);
+    let columnars = &[&only_numbers, &only_numbers2];
+    let merge_order = StackMergeOrder::stack(columnars).into();
+    let grouped: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
+        group_columns_for_merge(columnars, &[], &merge_order).unwrap();
+    assert_eq!(grouped.len(), 2);
+
+    let numbers_key = ("numbers".to_string(), ColumnTypeCategory::Numerical);
+    assert!(grouped.contains_key(&numbers_key));
+    let numbers_columns = &grouped.get(&numbers_key).unwrap().columns;
+    assert!(numbers_columns[0].is_some());
+    assert!(numbers_columns[1].is_none());
+
+    let numbers2_key = ("numbers2".to_string(), ColumnTypeCategory::Numerical);
+    let numbers2_columns = &grouped.get(&numbers2_key).unwrap().columns;
+    assert!(numbers2_columns[0].is_none());
+    assert!(numbers2_columns[1].is_some());
+}
+
+/// Builds a columnar out of several numerical columns.
+///
+/// Each entry of `columns` is `(column name, rows)`, where every row is the (possibly empty)
+/// list of values recorded for that row. The columnar gets as many rows as the longest column.
+fn make_numerical_columnar_multiple_columns(
+    columns: &[(&str, &[&[NumericalValue]])],
+) -> ColumnarReader {
+    let num_rows = columns
+        .iter()
+        .map(|(_, rows)| rows.len() as RowId)
+        .max()
+        .unwrap_or(0u32);
+    let mut writer = ColumnarWriter::default();
+    for (column_name, rows) in columns {
+        for (row_id, row_vals) in rows.iter().enumerate() {
+            for val in row_vals.iter() {
+                writer.record_numerical(row_id as u32, column_name, *val);
+            }
+        }
+    }
+    let mut serialized: Vec<u8> = Vec::new();
+    writer.serialize(num_rows, None, &mut serialized).unwrap();
+    ColumnarReader::open(serialized).unwrap()
+}
+
+/// Builds a columnar out of several bytes columns, with exactly `num_rows` rows.
+///
+/// Each entry of `columns` is `(column name, rows)`, where every row is the (possibly empty)
+/// list of byte strings recorded for that row.
+///
+/// Panics if a column does not have exactly `num_rows` rows.
+#[track_caller]
+fn make_byte_columnar_multiple_columns(
+    columns: &[(&str, &[&[&[u8]]])],
+    num_rows: u32,
+) -> ColumnarReader {
+    let mut writer = ColumnarWriter::default();
+    for (column_name, rows) in columns {
+        assert_eq!(
+            rows.len(),
+            num_rows as usize,
+            "All columns must have `{num_rows}` rows"
+        );
+        for (row_id, row_vals) in rows.iter().enumerate() {
+            for val in row_vals.iter() {
+                writer.record_bytes(row_id as u32, column_name, val);
+            }
+        }
+    }
+    let mut serialized: Vec<u8> = Vec::new();
+    writer.serialize(num_rows, None, &mut serialized).unwrap();
+    ColumnarReader::open(serialized).unwrap()
+}
+
+/// Builds a columnar out of several text columns.
+///
+/// Each entry of `columns` is `(column name, rows)`, where every row is the (possibly empty)
+/// list of strings recorded for that row. The columnar gets as many rows as the longest column.
+fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> ColumnarReader {
+    let num_rows = columns
+        .iter()
+        .map(|(_, rows)| rows.len() as RowId)
+        .max()
+        .unwrap_or(0u32);
+    let mut writer = ColumnarWriter::default();
+    for (column_name, rows) in columns {
+        for (row_id, row_vals) in rows.iter().enumerate() {
+            for val in row_vals.iter() {
+                writer.record_str(row_id as u32, column_name, val);
+            }
+        }
+    }
+    let mut serialized: Vec<u8> = Vec::new();
+    writer.serialize(num_rows, None, &mut serialized).unwrap();
+    ColumnarReader::open(serialized).unwrap()
+}
+
+#[test]
+fn test_merge_columnar_numbers() {
+    // First segment: one row holding -1. Second segment: an empty row followed by -3.
+    let first_segment =
+        make_numerical_columnar_multiple_columns(&[("numbers", &[&[NumericalValue::from(-1f64)]])]);
+    let second_segment = make_numerical_columnar_multiple_columns(&[(
+        "numbers",
+        &[&[], &[NumericalValue::from(-3f64)]],
+    )]);
+    let columnars = &[&first_segment, &second_segment];
+    let merge_order = MergeRowOrder::Stack(StackMergeOrder::stack(columnars));
+    let mut merged_bytes = Vec::new();
+    crate::columnar::merge_columnar(columnars, &[], merge_order, &mut merged_bytes).unwrap();
+
+    let merged = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged.num_rows(), 3);
+    assert_eq!(merged.num_columns(), 1);
+    let handles = merged.read_columns("numbers").unwrap();
+    let DynamicColumn::F64(vals) = handles[0].open().unwrap() else {
+        panic!()
+    };
+    assert_eq!(vals.get_cardinality(), Cardinality::Optional);
+    assert_eq!(vals.first(0u32), Some(-1f64));
+    assert_eq!(vals.first(1u32), None);
+    assert_eq!(vals.first(2u32), Some(-3f64));
+}
+
+#[test]
+fn test_merge_columnar_texts() {
+    // First segment: one row holding "a". Second segment: an empty row followed by "b".
+    let first_segment = make_text_columnar_multiple_columns(&[("texts", &[&["a"]])]);
+    let second_segment = make_text_columnar_multiple_columns(&[("texts", &[&[], &["b"]])]);
+    let columnars = &[&first_segment, &second_segment];
+    let merge_order = MergeRowOrder::Stack(StackMergeOrder::stack(columnars));
+    let mut merged_bytes = Vec::new();
+    crate::columnar::merge_columnar(columnars, &[], merge_order, &mut merged_bytes).unwrap();
+
+    let merged = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged.num_rows(), 3);
+    assert_eq!(merged.num_columns(), 1);
+    let handles = merged.read_columns("texts").unwrap();
+    let DynamicColumn::Str(vals) = handles[0].open().unwrap() else {
+        panic!()
+    };
+    assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
+
+    // Resolves a term ordinal into its string.
+    let str_for_ord = |ord| {
+        let mut term = String::new();
+        vals.ord_to_str(ord, &mut term).unwrap();
+        term
+    };
+
+    assert_eq!(vals.dictionary.num_terms(), 2);
+    assert_eq!(str_for_ord(0), "a");
+    assert_eq!(str_for_ord(1), "b");
+
+    // Resolves a row into its (at most one) string, or "" if the row is empty.
+    let str_for_row = |row_id| {
+        let row_ords: Vec<u64> = vals.term_ords(row_id).collect();
+        assert!(row_ords.len() <= 1);
+        let mut term = String::new();
+        if let [only_ord] = row_ords.as_slice() {
+            vals.ord_to_str(*only_ord, &mut term).unwrap();
+        }
+        term
+    };
+
+    assert_eq!(str_for_row(0), "a");
+    assert_eq!(str_for_row(1), "");
+    assert_eq!(str_for_row(2), "b");
+}
+
+#[test]
+fn test_merge_columnar_byte() {
+    let first_segment =
+        make_byte_columnar_multiple_columns(&[("bytes", &[&[b"bbbb"], &[b"baaa"]])], 2);
+    let second_segment = make_byte_columnar_multiple_columns(&[("bytes", &[&[], &[b"a"]])], 2);
+    let columnars = &[&first_segment, &second_segment];
+    let merge_order = MergeRowOrder::Stack(StackMergeOrder::stack(columnars));
+    let mut merged_bytes = Vec::new();
+    crate::columnar::merge_columnar(columnars, &[], merge_order, &mut merged_bytes).unwrap();
+
+    let merged = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged.num_rows(), 4);
+    assert_eq!(merged.num_columns(), 1);
+    let handles = merged.read_columns("bytes").unwrap();
+    let DynamicColumn::Bytes(vals) = handles[0].open().unwrap() else {
+        panic!()
+    };
+
+    // Resolves a term ordinal into its bytes.
+    let bytes_for_ord = |ord| {
+        let mut term = Vec::new();
+        vals.ord_to_bytes(ord, &mut term).unwrap();
+        term
+    };
+
+    // The merged dictionary is sorted.
+    assert_eq!(vals.dictionary.num_terms(), 3);
+    assert_eq!(bytes_for_ord(0), b"a");
+    assert_eq!(bytes_for_ord(1), b"baaa");
+    assert_eq!(bytes_for_ord(2), b"bbbb");
+
+    // Resolves a row into its (at most one) byte string, or b"" if the row is empty.
+    let bytes_for_row = |row_id| {
+        let row_ords: Vec<u64> = vals.term_ords(row_id).collect();
+        assert!(row_ords.len() <= 1);
+        let mut term = Vec::new();
+        if let [only_ord] = row_ords.as_slice() {
+            vals.ord_to_bytes(*only_ord, &mut term).unwrap();
+        }
+        term
+    };
+
+    assert_eq!(bytes_for_row(0), b"bbbb");
+    assert_eq!(bytes_for_row(1), b"baaa");
+    assert_eq!(bytes_for_row(2), b"");
+    assert_eq!(bytes_for_row(3), b"a");
+}
+
+#[test]
+fn test_merge_columnar_byte_with_missing() {
+    // The first segment has 3 rows and no column at all.
+    let empty_segment = make_byte_columnar_multiple_columns(&[], 3);
+    let two_row_segment = make_byte_columnar_multiple_columns(&[("col", &[&[b"b"], &[]])], 2);
+    let two_column_segment = make_byte_columnar_multiple_columns(
+        &[
+            ("col", &[&[], &[b"b"], &[b"a", b"b"]]),
+            ("col2", &[&[b"hello"], &[], &[b"a", b"b"]]),
+        ],
+        3,
+    );
+    let columnars = &[&empty_segment, &two_row_segment, &two_column_segment];
+    let merge_order = MergeRowOrder::Stack(StackMergeOrder::stack(columnars));
+    let mut merged_bytes = Vec::new();
+    crate::columnar::merge_columnar(columnars, &[], merge_order, &mut merged_bytes).unwrap();
+
+    let merged = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged.num_rows(), 3 + 2 + 3);
+    assert_eq!(merged.num_columns(), 2);
+    let handles = merged.read_columns("col").unwrap();
+    let DynamicColumn::Bytes(vals) = handles[0].open().unwrap() else {
+        panic!()
+    };
+
+    // Resolves a term ordinal into its bytes.
+    let bytes_for_ord = |ord| {
+        let mut term = Vec::new();
+        vals.ord_to_bytes(ord, &mut term).unwrap();
+        term
+    };
+    assert_eq!(vals.dictionary.num_terms(), 2);
+    assert_eq!(bytes_for_ord(0), b"a");
+    assert_eq!(bytes_for_ord(1), b"b");
+
+    // Resolves a row into the list of all its byte strings.
+    let bytes_for_row = |row_id| {
+        let mut row_terms: Vec<Vec<u8>> = Vec::new();
+        for term_ord in vals.term_ords(row_id) {
+            let mut term = Vec::new();
+            vals.ord_to_bytes(term_ord, &mut term).unwrap();
+            row_terms.push(term);
+        }
+        row_terms
+    };
+    // Rows 0..3 come from the segment without any column.
+    assert!(bytes_for_row(0).is_empty());
+    assert!(bytes_for_row(1).is_empty());
+    assert!(bytes_for_row(2).is_empty());
+    // Rows 3..5 come from the second segment.
+    assert_eq!(bytes_for_row(3), vec![b"b".to_vec()]);
+    assert!(bytes_for_row(4).is_empty());
+    // Rows 5..8 come from the third segment.
+    assert!(bytes_for_row(5).is_empty());
+    assert_eq!(bytes_for_row(6), vec![b"b".to_vec()]);
+    assert_eq!(bytes_for_row(7), vec![b"a".to_vec(), b"b".to_vec()]);
+}
+
+#[test]
+fn test_merge_columnar_different_types() {
+    // The column `mixed` is a text column in the first two segments
+    // and an i64 column in the third one.
+    let text_segment1 = make_text_columnar_multiple_columns(&[("mixed", &[&["a"]])]);
+    let text_segment2 = make_text_columnar_multiple_columns(&[("mixed", &[&[], &["b"]])]);
+    let i64_segment = make_columnar("mixed", &[1i64]);
+    let columnars = &[&text_segment1, &text_segment2, &i64_segment];
+    let merge_order = MergeRowOrder::Stack(StackMergeOrder::stack(columnars));
+    let mut merged_bytes = Vec::new();
+    crate::columnar::merge_columnar(columnars, &[], merge_order, &mut merged_bytes).unwrap();
+
+    let merged = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged.num_rows(), 4);
+    assert_eq!(merged.num_columns(), 2);
+    let handles = merged.read_columns("mixed").unwrap();
+
+    // numeric column
+    let DynamicColumn::I64(numbers) = handles[0].open().unwrap() else {
+        panic!()
+    };
+    assert_eq!(numbers.get_cardinality(), Cardinality::Optional);
+    assert_eq!(numbers.values_for_doc(0).collect_vec(), vec![]);
+    assert_eq!(numbers.values_for_doc(1).collect_vec(), vec![]);
+    assert_eq!(numbers.values_for_doc(2).collect_vec(), vec![]);
+    assert_eq!(numbers.values_for_doc(3).collect_vec(), vec![1]);
+    assert_eq!(numbers.values_for_doc(4).collect_vec(), vec![]);
+
+    // text column
+    let DynamicColumn::Str(texts) = handles[1].open().unwrap() else {
+        panic!()
+    };
+    assert_eq!(texts.ords().get_cardinality(), Cardinality::Optional);
+    // Resolves a term ordinal into its string.
+    let str_for_ord = |ord| {
+        let mut term = String::new();
+        texts.ord_to_str(ord, &mut term).unwrap();
+        term
+    };
+
+    assert_eq!(texts.dictionary.num_terms(), 2);
+    assert_eq!(str_for_ord(0), "a");
+    assert_eq!(str_for_ord(1), "b");
+
+    // Resolves a row into the list of all its strings.
+    let strs_for_row = |row_id| {
+        let mut row_terms: Vec<String> = Vec::new();
+        for term_ord in texts.term_ords(row_id) {
+            let mut term = String::new();
+            texts.ord_to_str(term_ord, &mut term).unwrap();
+            row_terms.push(term);
+        }
+        row_terms
+    };
+
+    assert_eq!(strs_for_row(0), vec!["a".to_string()]);
+    assert_eq!(strs_for_row(1), Vec::<String>::new());
+    assert_eq!(strs_for_row(2), vec!["b".to_string()]);
+    assert_eq!(strs_for_row(3), Vec::<String>::new());
+}
+
+#[test]
+fn test_merge_columnar_different_empty_cardinality() {
+    // The column `mixed` is a text column in one segment and an i64 column in the other one.
+    let text_segment = make_text_columnar_multiple_columns(&[("mixed", &[&["a"]])]);
+    let i64_segment = make_columnar("mixed", &[1i64]);
+    let columnars = &[&text_segment, &i64_segment];
+    let merge_order = MergeRowOrder::Stack(StackMergeOrder::stack(columnars));
+    let mut merged_bytes = Vec::new();
+    crate::columnar::merge_columnar(columnars, &[], merge_order, &mut merged_bytes).unwrap();
+
+    let merged = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged.num_rows(), 2);
+    assert_eq!(merged.num_columns(), 2);
+    let handles = merged.read_columns("mixed").unwrap();
+
+    // numeric column
+    let numeric_column = handles[0].open().unwrap();
+    assert_eq!(numeric_column.get_cardinality(), Cardinality::Optional);
+
+    // text column
+    let text_column = handles[1].open().unwrap();
+    assert_eq!(text_column.get_cardinality(), Cardinality::Optional);
+}
--- a/columnar/src/columnar/mod.rs
+++ b/columnar/src/columnar/mod.rs
@@ -5,6 +5,8 @@ mod reader;
 mod writer;

 pub use column_type::{ColumnType, HasAssociatedColumnType};
-pub use merge::{merge_columnar, MergeDocOrder};
+#[cfg(test)]
+pub(crate) use merge::ColumnTypeCategory;
+pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
 pub use reader::ColumnarReader;
 pub use writer::ColumnarWriter;
--- a/columnar/src/columnar/reader/mod.rs
+++ b/columnar/src/columnar/reader/mod.rs
@@ -1,4 +1,4 @@
-use std::{io, mem};
+use std::{fmt, io, mem};

 use common::file_slice::FileSlice;
 use common::BinarySerializable;
@@ -6,6 +6,7 @@ use sstable::{Dictionary, RangeSSTable};

 use crate::columnar::{format_version, ColumnType};
 use crate::dynamic_column::DynamicColumnHandle;
+use crate::RowId;

 fn io_invalid_data(msg: String) -> io::Error {
    io::Error::new(io::ErrorKind::InvalidData, msg)
@@ -13,9 +14,63 @@ fn io_invalid_data(msg: String) -> io::Error {

 /// The ColumnarReader makes it possible to access a set of columns
 /// associated to field names.
+#[derive(Clone)]
 pub struct ColumnarReader {
    column_dictionary: Dictionary<RangeSSTable>,
    column_data: FileSlice,
+    num_rows: RowId,
+}
+
+/// Best-effort debug representation of a columnar.
+///
+/// It shows the number of rows, the number of columns, and at most the first 5 columns.
+/// Columns holding more than 10 values are abbreviated as `".."`.
+///
+/// Formatting never panics on unreadable data: if the columns cannot be listed or a column
+/// cannot be opened, the I/O error is displayed in place of the missing information.
+impl fmt::Debug for ColumnarReader {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let num_rows = self.num_rows();
+        let mut debug_struct = f.debug_struct("Columnar");
+        debug_struct.field("num_rows", &num_rows);
+        let columns = match self.list_columns() {
+            Ok(columns) => columns,
+            Err(list_err) => {
+                // Nothing more can be shown: report the error rather than panicking.
+                return debug_struct.field("columns_error", &list_err).finish();
+            }
+        };
+        let num_cols = columns.len();
+        debug_struct.field("num_cols", &num_cols);
+        for (col_name, dynamic_column_handle) in columns.into_iter().take(5) {
+            match dynamic_column_handle.open() {
+                Ok(col) if col.num_values() > 10 => {
+                    debug_struct.field(&col_name, &"..");
+                }
+                Ok(col) => {
+                    debug_struct.field(&col_name, &col);
+                }
+                Err(open_err) => {
+                    debug_struct.field(&col_name, &open_err);
+                }
+            }
+        }
+        if num_cols > 5 {
+            // Signals that some columns have been left out.
+            debug_struct.finish_non_exhaustive()
+        } else {
+            debug_struct.finish()
+        }
+    }
+}
+
+/// Function shared by the async and sync code paths reading columns.
+///
+/// It consumes a stream over the column sstable and returns one
+/// `DynamicColumnHandle` per entry of the stream.
+fn read_all_columns_in_stream(
+    mut stream: sstable::Streamer<'_, RangeSSTable>,
+    column_data: &FileSlice,
+) -> io::Result<Vec<DynamicColumnHandle>> {
+    let mut handles = Vec::new();
+    while stream.advance() {
+        // The column type code is the last byte of the key.
+        let column_code = match stream.key().last() {
+            Some(&code) => code,
+            None => return Err(io_invalid_data("Empty column name.".to_string())),
+        };
+        let column_type = ColumnType::try_from_code(column_code)
+            .map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
+        // The value gives the byte range of the column within `column_data`.
+        let byte_range = stream.value();
+        let (start, end) = (byte_range.start as usize, byte_range.end as usize);
+        handles.push(DynamicColumnHandle {
+            file_slice: column_data.slice(start..end),
+            column_type,
+        });
+    }
+    Ok(handles)
 }

 impl ColumnarReader {
@@ -27,54 +82,64 @@ impl ColumnarReader {

    fn open_inner(file_slice: FileSlice) -> io::Result<ColumnarReader> {
        let (file_slice_without_sstable_len, footer_slice) = file_slice
-            .split_from_end(mem::size_of::<u64>() + format_version::VERSION_FOOTER_NUM_BYTES);
+            .split_from_end(mem::size_of::<u64>() + 4 + format_version::VERSION_FOOTER_NUM_BYTES);
        let footer_bytes = footer_slice.read_bytes()?;
-        let (mut sstable_len_bytes, version_footer_bytes) =
-            footer_bytes.rsplit(format_version::VERSION_FOOTER_NUM_BYTES);
+        let sstable_len = u64::deserialize(&mut &footer_bytes[0..8])?;
+        let num_rows = u32::deserialize(&mut &footer_bytes[8..12])?;
        let version_footer_bytes: [u8; format_version::VERSION_FOOTER_NUM_BYTES] =
-            version_footer_bytes.as_slice().try_into().unwrap();
+            footer_bytes[12..].try_into().unwrap();
        let _version = format_version::parse_footer(version_footer_bytes)?;
-        let sstable_len = u64::deserialize(&mut sstable_len_bytes)?;
        let (column_data, sstable) =
            file_slice_without_sstable_len.split_from_end(sstable_len as usize);
        let column_dictionary = Dictionary::open(sstable)?;
        Ok(ColumnarReader {
            column_dictionary,
            column_data,
+            num_rows,
        })
    }

+    /// Returns the number of rows of the columnar, as read from the footer when it was opened.
+    pub fn num_rows(&self) -> RowId {
+        self.num_rows
+    }
+    /// Iterate over the columns in a sorted way.
+    ///
+    /// Each item is the column name along with the handle of the column.
+    ///
+    /// # Panics
+    ///
+    /// The returned iterator panics if a key of the column dictionary is empty
+    /// or ends with an unknown column type code.
+    pub fn iter_columns(
+        &self,
+    ) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
+        let mut stream = self.column_dictionary.stream()?;
+        let next_column = move || {
+            if !stream.advance() {
+                return None;
+            }
+            let key_bytes: &[u8] = stream.key();
+            let column_code: u8 = key_bytes.last().cloned().unwrap();
+            // TODO Error Handling. The API gets quite ugly when returning the error here, so
+            // instead we could just check the first N columns upfront.
+            let column_type: ColumnType = ColumnType::try_from_code(column_code)
+                .map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
+                .unwrap();
+            let byte_range = stream.value().clone();
+            // The last two bytes are respectively the 0u8 separator and the column_type.
+            let name_len = key_bytes.len() - 2;
+            let column_name = String::from_utf8_lossy(&key_bytes[..name_len]).into_owned();
+            let column_handle = DynamicColumnHandle {
+                file_slice: self
+                    .column_data
+                    .slice(byte_range.start as usize..byte_range.end as usize),
+                column_type,
+            };
+            Some((column_name, column_handle))
+        };
+        Ok(std::iter::from_fn(next_column))
+    }
+
    // TODO Add unit tests
    pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
-        let mut stream = self.column_dictionary.stream()?;
-        let mut results = Vec::new();
-        while stream.advance() {
-            let key_bytes: &[u8] = stream.key();
-            let column_code: u8 = key_bytes.last().cloned().unwrap();
-            let column_type: ColumnType = ColumnType::try_from_code(column_code)
-                .map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
-            let range = stream.value().clone();
-            let column_name =
-                // The last two bytes are respectively the 0u8 separator and the column_type.
-                String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
-            let file_slice = self
-                .column_data
-                .slice(range.start as usize..range.end as usize);
-            let column_handle = DynamicColumnHandle {
-                file_slice,
-                column_type,
-            };
-            results.push((column_name, column_handle));
-        }
-        Ok(results)
+        Ok(self.iter_columns()?.collect())
    }

-    /// Get all columns for the given column name.
-    ///
-    /// There can be more than one column associated to a given column name, provided they have
-    /// different types.
-    // TODO fix ugly API
-    pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
+    fn stream_for_column_range(&self, column_name: &str) -> sstable::StreamerBuilder<RangeSSTable> {
        // Each column is a associated to a given `column_key`,
        // that starts by `column_name\0column_header`.
        //
@@ -83,36 +148,35 @@ impl ColumnarReader {
        //
        // This is in turn equivalent to searching for the range
        // `[column_name,\0`..column_name\1)`.
-
-        // TODO can we get some more generic `prefix(..)` logic in the dictioanry.
+        // TODO can we get some more generic `prefix(..)` logic in the dictionary.
        let mut start_key = column_name.to_string();
        start_key.push('\0');
        let mut end_key = column_name.to_string();
        end_key.push(1u8 as char);
-        let mut stream = self
-            .column_dictionary
+        self.column_dictionary
            .range()
            .ge(start_key.as_bytes())
            .lt(end_key.as_bytes())
-            .into_stream()?;
-        let mut results = Vec::new();
-        while stream.advance() {
-            let key_bytes: &[u8] = stream.key();
-            assert!(key_bytes.starts_with(start_key.as_bytes()));
-            let column_code: u8 = key_bytes.last().cloned().unwrap();
-            let column_type = ColumnType::try_from_code(column_code)
-                .map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
-            let range = stream.value().clone();
-            let file_slice = self
-                .column_data
-                .slice(range.start as usize..range.end as usize);
-            let dynamic_column_handle = DynamicColumnHandle {
-                file_slice,
-                column_type,
-            };
-            results.push(dynamic_column_handle);
-        }
-        Ok(results)
+    }
+
+    /// Async counterpart of `read_columns`: gets all columns for the given column name,
+    /// opening the stream over the column dictionary asynchronously.
+    pub async fn read_columns_async(
+        &self,
+        column_name: &str,
+    ) -> io::Result<Vec<DynamicColumnHandle>> {
+        let range_builder = self.stream_for_column_range(column_name);
+        let column_stream = range_builder.into_stream_async().await?;
+        read_all_columns_in_stream(column_stream, &self.column_data)
+    }
+
+    /// Returns the handles of all the columns registered under `column_name`.
+    ///
+    /// Several columns may share the same column name, as long as their types differ.
+    pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
+        let range_builder = self.stream_for_column_range(column_name);
+        let column_stream = range_builder.into_stream()?;
+        read_all_columns_in_stream(column_stream, &self.column_data)
     }

    /// Return the number of columns in the columnar.
@@ -120,3 +184,46 @@ impl ColumnarReader {
        self.column_dictionary.num_terms()
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::{ColumnType, ColumnarReader, ColumnarWriter};
+
+    #[test]
+    fn test_list_columns() {
+        let mut columnar_writer = ColumnarWriter::default();
+        columnar_writer.record_column_type("col1", ColumnType::Str, false);
+        columnar_writer.record_column_type("col2", ColumnType::U64, false);
+        let mut buffer = Vec::new();
+        columnar_writer.serialize(1, None, &mut buffer).unwrap();
+        let columnar = ColumnarReader::open(buffer).unwrap();
+        let columns = columnar.list_columns().unwrap();
+        assert_eq!(columns.len(), 2);
+        assert_eq!(&columns[0].0, "col1");
+        assert_eq!(columns[0].1.column_type(), ColumnType::Str);
+        assert_eq!(&columns[1].0, "col2");
+        assert_eq!(columns[1].1.column_type(), ColumnType::U64);
+    }
+
+    #[test]
+    fn test_list_columns_strict_typing_prevents_coercion() {
+        let mut columnar_writer = ColumnarWriter::default();
+        columnar_writer.record_column_type("count", ColumnType::U64, false);
+        columnar_writer.record_numerical(1, "count", 1u64);
+        let mut buffer = Vec::new();
+        columnar_writer.serialize(2, None, &mut buffer).unwrap();
+        let columnar = ColumnarReader::open(buffer).unwrap();
+        let columns = columnar.list_columns().unwrap();
+        assert_eq!(columns.len(), 1);
+        assert_eq!(&columns[0].0, "count");
+        assert_eq!(columns[0].1.column_type(), ColumnType::U64);
+    }
+
+    #[test]
+    #[should_panic(expected = "Input type forbidden")]
+    fn test_list_columns_strict_typing_panics_on_wrong_types() {
+        let mut columnar_writer = ColumnarWriter::default();
+        columnar_writer.record_column_type("count", ColumnType::U64, false);
+        columnar_writer.record_numerical(1, "count", 1i64);
+    }
+}
--- a/columnar/src/columnar/writer/column_operation.rs
+++ b/columnar/src/columnar/writer/column_operation.rs
@@ -310,7 +310,7 @@ mod tests {
        buffer.extend_from_slice(b"234234");
        let mut bytes = &buffer[..];
        let serdeser_symbol = ColumnOperation::deserialize(&mut bytes).unwrap();
-        assert_eq!(bytes.len() + buf.as_ref().len() as usize, buffer.len());
+        assert_eq!(bytes.len() + buf.as_ref().len(), buffer.len());
        assert_eq!(column_op, serdeser_symbol);
    }

@@ -341,7 +341,7 @@ mod tests {
    fn test_column_operation_unordered_aux(val: u32, expected_len: usize) {
        let column_op = ColumnOperation::Value(UnorderedId(val));
        let minibuf = column_op.serialize();
-        assert_eq!(minibuf.as_ref().len() as usize, expected_len);
+        assert_eq!({ minibuf.as_ref().len() }, expected_len);
        let mut buf = minibuf.as_ref().to_vec();
        buf.extend_from_slice(&[2, 2, 2, 2, 2, 2]);
        let mut cursor = &buf[..];
--- a/columnar/src/columnar/writer/column_writers.rs
+++ b/columnar/src/columnar/writer/column_writers.rs
@@ -41,10 +41,31 @@ impl ColumnWriter {
    pub(super) fn operation_iterator<'a, V: SymbolValue>(
        &self,
        arena: &MemoryArena,
+        old_to_new_ids_opt: Option<&[RowId]>,
        buffer: &'a mut Vec<u8>,
    ) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
        buffer.clear();
        self.values.read_to_end(arena, buffer);
+        if let Some(old_to_new_ids) = old_to_new_ids_opt {
+            // TODO avoid the extra deserialization / serialization.
+            let mut sorted_ops: Vec<(RowId, ColumnOperation<V>)> = Vec::new();
+            let mut new_doc = 0u32;
+            let mut cursor = &buffer[..];
+            for op in std::iter::from_fn(|| ColumnOperation::<V>::deserialize(&mut cursor)) {
+                if let ColumnOperation::NewDoc(doc) = &op {
+                    new_doc = old_to_new_ids[*doc as usize];
+                    sorted_ops.push((new_doc, ColumnOperation::NewDoc(new_doc)));
+                } else {
+                    sorted_ops.push((new_doc, op));
+                }
+            }
+            // The sort must be stable: a row's `NewDoc` and its values share the same key.
+            sorted_ops.sort_by_key(|(new_doc_id, _)| *new_doc_id);
+            buffer.clear();
+            for (_, op) in sorted_ops {
+                buffer.extend_from_slice(op.serialize().as_ref());
+            }
+        }
        let mut cursor: &[u8] = &buffer[..];
        std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor))
    }
@@ -114,7 +135,7 @@ impl NumericalColumnWriter {
 /// State used to store what types are still acceptable
 /// after having seen a set of numerical values.
 #[derive(Clone, Copy)]
-enum CompatibleNumericalTypes {
+pub(crate) enum CompatibleNumericalTypes {
    Dynamic {
        all_values_within_i64_range: bool,
        all_values_within_u64_range: bool,
@@ -132,7 +153,7 @@ impl Default for CompatibleNumericalTypes {
 }

 impl CompatibleNumericalTypes {
-    fn is_type_accepted(&self, numerical_type: NumericalType) -> bool {
+    pub fn is_type_accepted(&self, numerical_type: NumericalType) -> bool {
        match self {
            CompatibleNumericalTypes::Dynamic {
                all_values_within_i64_range,
@@ -148,7 +169,7 @@ impl CompatibleNumericalTypes {
        }
    }

-    fn accept_value(&mut self, numerical_value: NumericalValue) {
+    pub fn accept_value(&mut self, numerical_value: NumericalValue) {
        match self {
            CompatibleNumericalTypes::Dynamic {
                all_values_within_i64_range,
@@ -168,7 +189,12 @@ impl CompatibleNumericalTypes {
                }
            },
            CompatibleNumericalTypes::StaticType(typ) => {
-                assert_eq!(numerical_value.numerical_type(), *typ);
+                assert_eq!(
+                    numerical_value.numerical_type(),
+                    *typ,
+                    "Input type forbidden. This column has been forced to type {typ:?}, received \
+                     {numerical_value:?}"
+                );
            }
        }
    }
@@ -205,9 +231,11 @@ impl NumericalColumnWriter {
    pub(super) fn operation_iterator<'a>(
        self,
        arena: &MemoryArena,
+        old_to_new_ids: Option<&[RowId]>,
        buffer: &'a mut Vec<u8>,
    ) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
-        self.column_writer.operation_iterator(arena, buffer)
+        self.column_writer
+            .operation_iterator(arena, old_to_new_ids, buffer)
    }
 }

@@ -215,6 +243,14 @@ impl NumericalColumnWriter {
 pub(crate) struct StrOrBytesColumnWriter {
    pub(crate) dictionary_id: u32,
    pub(crate) column_writer: ColumnWriter,
+    // If true and the column ends up multivalued, the values
+    // associated to a given row are sorted at serialization time.
+    //
+    // This is useful for facets.
+    //
+    // If false, the order of appearance in the document is
+    // preserved.
+    pub(crate) sort_values_within_row: bool,
 }

 impl StrOrBytesColumnWriter {
@@ -222,6 +258,7 @@ impl StrOrBytesColumnWriter {
        StrOrBytesColumnWriter {
            dictionary_id,
            column_writer: Default::default(),
+            sort_values_within_row: false,
        }
    }

@@ -239,9 +276,11 @@ impl StrOrBytesColumnWriter {
    pub(super) fn operation_iterator<'a>(
        &self,
        arena: &MemoryArena,
+        old_to_new_ids: Option<&[RowId]>,
        byte_buffer: &'a mut Vec<u8>,
    ) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
-        self.column_writer.operation_iterator(arena, byte_buffer)
+        self.column_writer
+            .operation_iterator(arena, old_to_new_ids, byte_buffer)
    }
 }

--- a/columnar/src/columnar/writer/mod.rs
+++ b/columnar/src/columnar/writer/mod.rs
@@ -7,8 +7,9 @@ use std::io;
 use std::net::Ipv6Addr;

 use column_operation::ColumnOperation;
+pub(crate) use column_writers::CompatibleNumericalTypes;
 use common::CountingWriter;
-use serializer::ColumnarSerializer;
+pub(crate) use serializer::ColumnarSerializer;
 use stacker::{Addr, ArenaHashMap, MemoryArena};

 use crate::column_index::SerializableColumnIndex;
@@ -29,10 +30,7 @@ use crate::{Cardinality, RowId};
 #[derive(Default)]
 struct SpareBuffers {
    value_index_builders: PreallocatedIndexBuilders,
-    i64_values: Vec<i64>,
    u64_values: Vec<u64>,
-    f64_values: Vec<f64>,
-    bool_values: Vec<bool>,
    ip_addr_values: Vec<Ipv6Addr>,
 }

@@ -47,8 +45,9 @@ struct SpareBuffers {
 /// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
 /// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
 /// let mut wrt: Vec<u8> =  Vec::new();
-/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
+/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
 /// ```
+#[derive(Default)]
 pub struct ColumnarWriter {
    numerical_field_hash_map: ArenaHashMap,
    datetime_field_hash_map: ArenaHashMap,
@@ -62,22 +61,6 @@ pub struct ColumnarWriter {
    buffers: SpareBuffers,
 }

-impl Default for ColumnarWriter {
-    fn default() -> Self {
-        ColumnarWriter {
-            numerical_field_hash_map: ArenaHashMap::new(10_000),
-            bool_field_hash_map: ArenaHashMap::new(10_000),
-            ip_addr_field_hash_map: ArenaHashMap::new(10_000),
-            bytes_field_hash_map: ArenaHashMap::new(10_000),
-            str_field_hash_map: ArenaHashMap::new(10_000),
-            datetime_field_hash_map: ArenaHashMap::new(10_000),
-            dictionaries: Vec::new(),
-            arena: MemoryArena::default(),
-            buffers: SpareBuffers::default(),
-        }
-    }
-}
-
 #[inline]
 fn mutate_or_create_column<V, TMutator>(
    arena_hash_map: &mut ArenaHashMap,
@@ -96,7 +79,6 @@ fn mutate_or_create_column<V, TMutator>(

 impl ColumnarWriter {
    pub fn mem_usage(&self) -> usize {
-        // TODO add dictionary builders.
        self.arena.mem_usage()
            + self.numerical_field_hash_map.mem_usage()
            + self.bool_field_hash_map.mem_usage()
@@ -104,9 +86,87 @@ impl ColumnarWriter {
            + self.str_field_hash_map.mem_usage()
            + self.ip_addr_field_hash_map.mem_usage()
            + self.datetime_field_hash_map.mem_usage()
+            + self
+                .dictionaries
+                .iter()
+                .map(|dict| dict.mem_usage())
+                .sum::<usize>()
    }

-    pub fn record_column_type(&mut self, column_name: &str, column_type: ColumnType) {
+    /// Returns the doc ids in `0..num_docs`, stably sorted by the `sort_field` column.
+    ///
+    /// The order is descending if `reversed`. An empty `Vec` is returned if no numerical or
+    /// datetime column has that name. For multivalued columns the first value of a row is
+    /// its score; rows without a value get `0.0`, which is not the lowest possible score when
+    /// the column holds negative values. Scores are compared as `f32`.
+    /// NOTE(review): datetime columns are recorded as `ColumnWriter` (`record_datetime`) but
+    /// are looked up here as `NumericalColumnWriter` — confirm this is intended.
+    pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
+        let Some(numerical_col_writer) = self
+            .numerical_field_hash_map
+            .get::<NumericalColumnWriter>(sort_field.as_bytes())
+            .or_else(|| {
+                self.datetime_field_hash_map
+                    .get::<NumericalColumnWriter>(sort_field.as_bytes())
+            })
+        else {
+            return Vec::new();
+        };
+        let mut symbols_buffer = Vec::new();
+        let mut values = Vec::new();
+        let mut start_doc_check_fill = 0;
+        let mut current_doc_opt: Option<RowId> = None;
+        // Assumption: `NewDoc` never announces the same doc twice and doc ids are strictly
+        // increasing between calls.
+        for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) {
+            match op {
+                ColumnOperation::NewDoc(doc) => {
+                    current_doc_opt = Some(doc);
+                }
+                ColumnOperation::Value(numerical_value) => {
+                    if let Some(current_doc) = current_doc_opt {
+                        // Rows skipped since the last scored doc get the score 0.0.
+                        values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc)));
+                        start_doc_check_fill = current_doc + 1;
+                        // Multivalued rows: ignore further values until the next NewDoc.
+                        current_doc_opt = None;
+
+                        let score: f32 = f64::coerce(numerical_value) as f32;
+                        values.push((score, current_doc));
+                    }
+                }
+            }
+        }
+        for doc in values.len() as u32..num_docs {
+            values.push((0.0f32, doc));
+        }
+        values.sort_by(|(left_score, _), (right_score, _)| {
+            if reversed {
+                right_score.total_cmp(left_score)
+            } else {
+                left_score.total_cmp(right_score)
+            }
+        });
+        values.into_iter().map(|(_score, doc)| doc).collect()
+    }
+
+    /// Records a column type. This is useful to bypass the coercion process,
+    /// to make sure an empty column is present in the resulting columnar, or to set
+    /// `sort_values_within_row`.
+    ///
+    /// `sort_values_within_row` is only allowed for `Bytes` or `Str` columns (panics otherwise).
+    pub fn record_column_type(
+        &mut self,
+        column_name: &str,
+        column_type: ColumnType,
+        sort_values_within_row: bool,
+    ) {
+        if sort_values_within_row {
+            assert!(
+                column_type == ColumnType::Bytes || column_type == ColumnType::Str,
+                "sort_values_within_row is only allowed for Bytes and Str columns",
+            );
+        }
        match column_type {
            ColumnType::Str | ColumnType::Bytes => {
                let (hash_map, dictionaries) = (
@@ -121,13 +181,15 @@ impl ColumnarWriter {
                    hash_map,
                    column_name,
                    |column_opt: Option<StrOrBytesColumnWriter>| {
-                        if let Some(column_writer) = column_opt {
+                        let mut column_writer = if let Some(column_writer) = column_opt {
                            column_writer
                        } else {
                            let dictionary_id = dictionaries.len() as u32;
                            dictionaries.push(DictionaryBuilder::default());
                            StrOrBytesColumnWriter::with_dictionary_id(dictionary_id)
-                        }
+                        };
+                        column_writer.sort_values_within_row = sort_values_within_row;
+                        column_writer
                    },
                );
            }
@@ -165,18 +227,6 @@ impl ColumnarWriter {
        }
    }

-    pub fn force_numerical_type(&mut self, column_name: &str, numerical_type: NumericalType) {
-        mutate_or_create_column(
-            &mut self.numerical_field_hash_map,
-            column_name,
-            |column_opt: Option<NumericalColumnWriter>| {
-                let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
-                column.force_numerical_type(numerical_type);
-                column
-            },
-        );
-    }
-
    pub fn record_numerical<T: Into<NumericalValue> + Copy>(
        &mut self,
        doc: RowId,
@@ -220,11 +270,15 @@ impl ColumnarWriter {
        });
    }

-    pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: crate::DateTime) {
+    pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: common::DateTime) {
        let (hash_map, arena) = (&mut self.datetime_field_hash_map, &mut self.arena);
        mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
            let mut column: ColumnWriter = column_opt.unwrap_or_default();
-            column.record(doc, NumericalValue::I64(datetime.timestamp_micros), arena);
+            column.record(
+                doc,
+                NumericalValue::I64(datetime.into_timestamp_nanos()),
+                arena,
+            );
            column
        });
    }
@@ -274,7 +328,12 @@ impl ColumnarWriter {
            },
        );
    }
-    pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
+    pub fn serialize(
+        &mut self,
+        num_docs: RowId,
+        old_to_new_row_ids: Option<&[RowId]>,
+        wrt: &mut dyn io::Write,
+    ) -> io::Result<()> {
        let mut serializer = ColumnarSerializer::new(wrt);
        let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
            .numerical_field_hash_map
@@ -317,35 +376,41 @@ impl ColumnarWriter {
        let mut symbol_byte_buffer: Vec<u8> = Vec::new();
        for (column_name, column_type, addr) in columns {
            match column_type {
-                ColumnType::Bool | ColumnType::DateTime => {
-                    let column_writer: ColumnWriter = if column_type == ColumnType::Bool {
-                        self.bool_field_hash_map.read(addr)
-                    } else {
-                        self.datetime_field_hash_map.read(addr)
-                    };
+                ColumnType::Bool => {
+                    let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
                    let cardinality = column_writer.get_cardinality(num_docs);
                    let mut column_serializer =
-                        serializer.serialize_column(column_name, ColumnType::Bool);
+                        serializer.start_serialize_column(column_name, column_type);
                    serialize_bool_column(
                        cardinality,
                        num_docs,
-                        column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
+                        column_writer.operation_iterator(
+                            arena,
+                            old_to_new_row_ids,
+                            &mut symbol_byte_buffer,
+                        ),
                        buffers,
                        &mut column_serializer,
                    )?;
+                    column_serializer.finalize()?;
                }
                ColumnType::IpAddr => {
                    let column_writer: ColumnWriter = self.ip_addr_field_hash_map.read(addr);
                    let cardinality = column_writer.get_cardinality(num_docs);
                    let mut column_serializer =
-                        serializer.serialize_column(column_name, ColumnType::IpAddr);
+                        serializer.start_serialize_column(column_name, ColumnType::IpAddr);
                    serialize_ip_addr_column(
                        cardinality,
                        num_docs,
-                        column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
+                        column_writer.operation_iterator(
+                            arena,
+                            old_to_new_row_ids,
+                            &mut symbol_byte_buffer,
+                        ),
                        buffers,
                        &mut column_serializer,
                    )?;
+                    column_serializer.finalize()?;
                }
                ColumnType::Bytes | ColumnType::Str => {
                    let str_or_bytes_column_writer: StrOrBytesColumnWriter =
@@ -360,43 +425,75 @@ impl ColumnarWriter {
                        .column_writer
                        .get_cardinality(num_docs);
                    let mut column_serializer =
-                        serializer.serialize_column(column_name, column_type);
+                        serializer.start_serialize_column(column_name, column_type);
                    serialize_bytes_or_str_column(
                        cardinality,
                        num_docs,
+                        str_or_bytes_column_writer.sort_values_within_row,
                        dictionary_builder,
-                        str_or_bytes_column_writer
-                            .operation_iterator(arena, &mut symbol_byte_buffer),
+                        str_or_bytes_column_writer.operation_iterator(
+                            arena,
+                            old_to_new_row_ids,
+                            &mut symbol_byte_buffer,
+                        ),
                        buffers,
                        &mut column_serializer,
                    )?;
+                    column_serializer.finalize()?;
                }
-                ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
+                ColumnType::F64 | ColumnType::I64 | ColumnType::U64 => {
                    let numerical_column_writer: NumericalColumnWriter =
                        self.numerical_field_hash_map.read(addr);
-                    let numerical_type = column_type.numerical_type().unwrap();
                    let cardinality = numerical_column_writer.cardinality(num_docs);
                    let mut column_serializer =
-                        serializer.serialize_column(column_name, ColumnType::from(numerical_type));
+                        serializer.start_serialize_column(column_name, column_type);
+                    let numerical_type = column_type.numerical_type().unwrap();
                    serialize_numerical_column(
                        cardinality,
                        num_docs,
                        numerical_type,
-                        numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
+                        numerical_column_writer.operation_iterator(
+                            arena,
+                            old_to_new_row_ids,
+                            &mut symbol_byte_buffer,
+                        ),
                        buffers,
                        &mut column_serializer,
                    )?;
+                    column_serializer.finalize()?;
+                }
+                ColumnType::DateTime => {
+                    let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr);
+                    let cardinality = column_writer.get_cardinality(num_docs);
+                    let mut column_serializer =
+                        serializer.start_serialize_column(column_name, ColumnType::DateTime);
+                    serialize_numerical_column(
+                        cardinality,
+                        num_docs,
+                        NumericalType::I64,
+                        column_writer.operation_iterator(
+                            arena,
+                            old_to_new_row_ids,
+                            &mut symbol_byte_buffer,
+                        ),
+                        buffers,
+                        &mut column_serializer,
+                    )?;
+                    column_serializer.finalize()?;
                }
            };
        }
-        serializer.finalize()?;
+        serializer.finalize(num_docs)?;
        Ok(())
    }
 }

+// Serialize [Dictionary, Column, dictionary num bytes U32::LE]
+// Column: [Column Index, Column Values, column index num bytes U32::LE]
 fn serialize_bytes_or_str_column(
    cardinality: Cardinality,
    num_docs: RowId,
+    sort_values_within_row: bool,
    dictionary_builder: &DictionaryBuilder,
    operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>,
    buffers: &mut SpareBuffers,
@@ -425,6 +522,7 @@ fn serialize_bytes_or_str_column(
        operation_iterator,
        cardinality,
        num_docs,
+        sort_values_within_row,
        value_index_builders,
        u64_values,
        &mut wrt,
@@ -444,8 +542,6 @@ fn serialize_numerical_column(
    let SpareBuffers {
        value_index_builders,
        u64_values,
-        i64_values,
-        f64_values,
        ..
    } = buffers;
    match numerical_type {
@@ -454,8 +550,9 @@ fn serialize_numerical_column(
                coerce_numerical_symbol::<i64>(op_iterator),
                cardinality,
                num_docs,
+                false,
                value_index_builders,
-                i64_values,
+                u64_values,
                wrt,
            )?;
        }
@@ -464,6 +561,7 @@ fn serialize_numerical_column(
                coerce_numerical_symbol::<u64>(op_iterator),
                cardinality,
                num_docs,
+                false,
                value_index_builders,
                u64_values,
                wrt,
@@ -474,8 +572,9 @@ fn serialize_numerical_column(
                coerce_numerical_symbol::<f64>(op_iterator),
                cardinality,
                num_docs,
+                false,
                value_index_builders,
-                f64_values,
+                u64_values,
                wrt,
            )?;
        }
@@ -492,15 +591,19 @@ fn serialize_bool_column(
 ) -> io::Result<()> {
    let SpareBuffers {
        value_index_builders,
-        bool_values,
+        u64_values,
        ..
    } = buffers;
    send_to_serialize_column_mappable_to_u64(
-        column_operations_it,
+        column_operations_it.map(|bool_column_operation| match bool_column_operation {
+            ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
+            ColumnOperation::Value(bool_val) => ColumnOperation::Value(bool_val.to_u64()),
+        }),
        cardinality,
        num_docs,
+        false,
        value_index_builders,
-        bool_values,
+        u64_values,
        wrt,
    )?;
    Ok(())
@@ -530,11 +633,11 @@ fn serialize_ip_addr_column(
 }

 fn send_to_serialize_column_mappable_to_u128<
-    T: Copy + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
+    T: Copy + Ord + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
 >(
    op_iterator: impl Iterator<Item = ColumnOperation<T>>,
    cardinality: Cardinality,
-    num_docs: RowId,
+    num_rows: RowId,
    value_index_builders: &mut PreallocatedIndexBuilders,
    values: &mut Vec<T>,
    mut wrt: impl io::Write,
@@ -556,37 +659,47 @@ where
        Cardinality::Optional => {
            let optional_index_builder = value_index_builders.borrow_optional_index_builder();
            consume_operation_iterator(op_iterator, optional_index_builder, values);
-            let optional_index = optional_index_builder.finish(num_docs);
-            SerializableColumnIndex::Optional(Box::new(optional_index))
+            let optional_index = optional_index_builder.finish(num_rows);
+            SerializableColumnIndex::Optional {
+                num_rows,
+                non_null_row_ids: Box::new(optional_index),
+            }
        }
        Cardinality::Multivalued => {
            let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
            consume_operation_iterator(op_iterator, multivalued_index_builder, values);
-            let multivalued_index = multivalued_index_builder.finish(num_docs);
+            let multivalued_index = multivalued_index_builder.finish(num_rows);
            SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
        }
    };
    crate::column::serialize_column_mappable_to_u128(
        serializable_column_index,
-        || values.iter().cloned(),
-        values.len() as u32,
+        &&values[..],
        &mut wrt,
    )?;
    Ok(())
 }

-fn send_to_serialize_column_mappable_to_u64<
-    T: Copy + Default + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU64 + PartialOrd,
->(
-    op_iterator: impl Iterator<Item = ColumnOperation<T>>,
+fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
+    let mut start_index: usize = 0;
+    for end_index in multivalued_index.iter().copied() {
+        let end_index = end_index as usize;
+        values[start_index..end_index].sort_unstable();
+        start_index = end_index;
+    }
+}
+
+fn send_to_serialize_column_mappable_to_u64(
+    op_iterator: impl Iterator<Item = ColumnOperation<u64>>,
    cardinality: Cardinality,
-    num_docs: RowId,
+    num_rows: RowId,
+    sort_values_within_row: bool,
    value_index_builders: &mut PreallocatedIndexBuilders,
-    values: &mut Vec<T>,
+    values: &mut Vec<u64>,
    mut wrt: impl io::Write,
 ) -> io::Result<()>
 where
-    for<'a> VecColumn<'a, T>: ColumnValues<T>,
+    for<'a> VecColumn<'a, u64>: ColumnValues<u64>,
 {
    values.clear();
    let serializable_column_index = match cardinality {
@@ -601,19 +714,25 @@ where
        Cardinality::Optional => {
            let optional_index_builder = value_index_builders.borrow_optional_index_builder();
            consume_operation_iterator(op_iterator, optional_index_builder, values);
-            let optional_index = optional_index_builder.finish(num_docs);
-            SerializableColumnIndex::Optional(Box::new(optional_index))
+            let optional_index = optional_index_builder.finish(num_rows);
+            SerializableColumnIndex::Optional {
+                non_null_row_ids: Box::new(optional_index),
+                num_rows,
+            }
        }
        Cardinality::Multivalued => {
            let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
            consume_operation_iterator(op_iterator, multivalued_index_builder, values);
-            let multivalued_index = multivalued_index_builder.finish(num_docs);
+            let multivalued_index = multivalued_index_builder.finish(num_rows);
+            if sort_values_within_row {
+                sort_values_within_row_in_place(multivalued_index, values);
+            }
            SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
        }
    };
    crate::column::serialize_column_mappable_to_u64(
        serializable_column_index,
-        &VecColumn::from(&values[..]),
+        &&values[..],
        &mut wrt,
    )?;
    Ok(())
@@ -621,17 +740,17 @@ where

 fn coerce_numerical_symbol<T>(
    operation_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
-) -> impl Iterator<Item = ColumnOperation<T>>
-where T: Coerce {
+) -> impl Iterator<Item = ColumnOperation<u64>>
+where T: Coerce + MonotonicallyMappableToU64 {
    operation_iterator.map(|symbol| match symbol {
        ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
        ColumnOperation::Value(numerical_value) => {
-            ColumnOperation::Value(Coerce::coerce(numerical_value))
+            ColumnOperation::Value(T::coerce(numerical_value).to_u64())
        }
    })
 }

-fn consume_operation_iterator<T: std::fmt::Debug, TIndexBuilder: IndexBuilder>(
+fn consume_operation_iterator<T: Ord, TIndexBuilder: IndexBuilder>(
    operation_iterator: impl Iterator<Item = ColumnOperation<T>>,
    index_builder: &mut TIndexBuilder,
    values: &mut Vec<T>,
@@ -666,7 +785,7 @@ mod tests {
        assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
        let mut buffer = Vec::new();
        let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
-            .operation_iterator(&mut arena, &mut buffer)
+            .operation_iterator(&arena, None, &mut buffer)
            .collect();
        assert_eq!(symbols.len(), 6);
        assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -695,7 +814,7 @@ mod tests {
        assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
        let mut buffer = Vec::new();
        let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
-            .operation_iterator(&mut arena, &mut buffer)
+            .operation_iterator(&arena, None, &mut buffer)
            .collect();
        assert_eq!(symbols.len(), 4);
        assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
@@ -718,7 +837,7 @@ mod tests {
        assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
        let mut buffer = Vec::new();
        let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
-            .operation_iterator(&mut arena, &mut buffer)
+            .operation_iterator(&arena, None, &mut buffer)
            .collect();
        assert_eq!(symbols.len(), 2);
        assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -737,7 +856,7 @@ mod tests {
        assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
        let mut buffer = Vec::new();
        let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
-            .operation_iterator(&mut arena, &mut buffer)
+            .operation_iterator(&arena, None, &mut buffer)
            .collect();
        assert_eq!(symbols.len(), 3);
        assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
--- a/columnar/src/columnar/writer/serializer.rs
+++ b/columnar/src/columnar/writer/serializer.rs
@@ -1,11 +1,12 @@
 use std::io;
 use std::io::Write;

-use common::CountingWriter;
+use common::{BinarySerializable, CountingWriter};
 use sstable::value::RangeValueWriter;
 use sstable::RangeSSTable;

 use crate::columnar::ColumnType;
+use crate::RowId;

 pub struct ColumnarSerializer<W: io::Write> {
    wrt: CountingWriter<W>,
@@ -33,11 +34,12 @@ impl<W: io::Write> ColumnarSerializer<W> {
        }
    }

-    pub fn serialize_column<'a>(
+    /// Creates a ColumnSerializer.
+    pub fn start_serialize_column<'a>(
        &'a mut self,
        column_name: &[u8],
        column_type: ColumnType,
-    ) -> impl io::Write + 'a {
+    ) -> ColumnSerializer<'a, W> {
        let start_offset = self.wrt.written_bytes();
        prepare_key(column_name, column_type, &mut self.prepare_key_buffer);
        ColumnSerializer {
@@ -46,11 +48,12 @@ impl<W: io::Write> ColumnarSerializer<W> {
        }
    }

-    pub(crate) fn finalize(mut self) -> io::Result<()> {
+    pub(crate) fn finalize(mut self, num_rows: RowId) -> io::Result<()> {
        let sstable_bytes: Vec<u8> = self.sstable_range.finish()?;
        let sstable_num_bytes: u64 = sstable_bytes.len() as u64;
        self.wrt.write_all(&sstable_bytes)?;
        self.wrt.write_all(&sstable_num_bytes.to_le_bytes()[..])?;
+        num_rows.serialize(&mut self.wrt)?;
        self.wrt
            .write_all(&super::super::format_version::footer())?;
        self.wrt.flush()?;
@@ -58,20 +61,21 @@ impl<W: io::Write> ColumnarSerializer<W> {
    }
 }

-struct ColumnSerializer<'a, W: io::Write> {
+pub struct ColumnSerializer<'a, W: io::Write> {
    columnar_serializer: &'a mut ColumnarSerializer<W>,
    start_offset: u64,
 }

-impl<'a, W: io::Write> Drop for ColumnSerializer<'a, W> {
-    fn drop(&mut self) {
+impl<'a, W: io::Write> ColumnSerializer<'a, W> {
+    pub fn finalize(self) -> io::Result<()> {
        let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
        let byte_range = self.start_offset..end_offset;
-        self.columnar_serializer.sstable_range.insert_cannot_fail(
+        self.columnar_serializer.sstable_range.insert(
            &self.columnar_serializer.prepare_key_buffer[..],
            &byte_range,
-        );
+        )?;
        self.columnar_serializer.prepare_key_buffer.clear();
+        Ok(())
    }
 }

--- a/columnar/src/columnar/writer/value_index.rs
+++ b/columnar/src/columnar/writer/value_index.rs
@@ -1,5 +1,4 @@
-use crate::column_index::SerializableOptionalIndex;
-use crate::column_values::{ColumnValues, VecColumn};
+use crate::iterable::Iterable;
 use crate::RowId;

 /// The `IndexBuilder` interprets a sequence of
@@ -29,34 +28,15 @@ pub struct OptionalIndexBuilder {
    docs: Vec<RowId>,
 }

-struct SingleValueArrayIndex<'a> {
-    // RowIds with a value, in a strictly increasing order
-    row_ids: &'a [RowId],
-    num_rows: RowId,
-}
-
-impl<'a> SerializableOptionalIndex<'a> for SingleValueArrayIndex<'a> {
-    fn num_rows(&self) -> RowId {
-        self.num_rows
-    }
-
-    fn non_null_rows(&self) -> Box<dyn Iterator<Item = RowId> + 'a> {
-        Box::new(self.row_ids.iter().copied())
-    }
-}
-
 impl OptionalIndexBuilder {
-    pub fn finish<'a>(&'a mut self, num_rows: RowId) -> impl SerializableOptionalIndex + 'a {
+    pub fn finish(&mut self, num_rows: RowId) -> impl Iterable<RowId> + '_ {
        debug_assert!(self
            .docs
            .last()
            .copied()
            .map(|last_doc| last_doc < num_rows)
            .unwrap_or(true));
-        SingleValueArrayIndex {
-            row_ids: &self.docs[..],
-            num_rows,
-        }
+        &self.docs[..]
    }

    fn reset(&mut self) {
@@ -84,14 +64,10 @@ pub struct MultivaluedIndexBuilder {
 }

 impl MultivaluedIndexBuilder {
-    pub fn finish(&mut self, num_docs: RowId) -> impl ColumnValues<u32> + '_ {
+    pub fn finish(&mut self, num_docs: RowId) -> &[u32] {
        self.start_offsets
            .resize(num_docs as usize + 1, self.total_num_vals_seen);
-        VecColumn {
-            values: &&self.start_offsets[..],
-            min_value: 0,
-            max_value: self.start_offsets.last().copied().unwrap_or(0),
-        }
+        &self.start_offsets[..]
    }

    fn reset(&mut self) {
@@ -149,7 +125,7 @@ mod tests {
        assert_eq!(
            &opt_value_index_builder
                .finish(1u32)
-                .non_null_rows()
+                .boxed_iter()
                .collect::<Vec<u32>>(),
            &[0]
        );
@@ -159,7 +135,7 @@ mod tests {
        assert_eq!(
            &opt_value_index_builder
                .finish(2u32)
-                .non_null_rows()
+                .boxed_iter()
                .collect::<Vec<u32>>(),
            &[1]
        );
@@ -174,10 +150,7 @@ mod tests {
        multivalued_value_index_builder.record_row(2u32);
        multivalued_value_index_builder.record_value();
        assert_eq!(
-            multivalued_value_index_builder
-                .finish(4u32)
-                .iter()
-                .collect::<Vec<u32>>(),
+            multivalued_value_index_builder.finish(4u32).to_vec(),
            vec![0, 0, 2, 3, 3]
        );
        multivalued_value_index_builder.reset();
@@ -185,10 +158,7 @@ mod tests {
        multivalued_value_index_builder.record_value();
        multivalued_value_index_builder.record_value();
        assert_eq!(
-            multivalued_value_index_builder
-                .finish(4u32)
-                .iter()
-                .collect::<Vec<u32>>(),
+            multivalued_value_index_builder.finish(4u32).to_vec(),
            vec![0, 0, 0, 2, 2]
        );
    }
--- a/columnar/src/dictionary.rs
+++ b/columnar/src/dictionary.rs
@@ -32,6 +32,7 @@ pub struct OrderedId(pub u32);
 #[derive(Default)]
 pub(crate) struct DictionaryBuilder {
    dict: FnvHashMap<Vec<u8>, UnorderedId>,
+    memory_consumption: usize,
 }

 impl DictionaryBuilder {
@@ -43,6 +44,8 @@ impl DictionaryBuilder {
        }
        let new_id = UnorderedId(self.dict.len() as u32);
        self.dict.insert(term.to_vec(), new_id);
+        self.memory_consumption += term.len();
+        self.memory_consumption += 40; // Term Metadata + HashMap overhead
        new_id
    }

@@ -63,6 +66,10 @@ impl DictionaryBuilder {
        sstable_builder.finish()?;
        Ok(TermIdMapping { unordered_to_ord })
    }
+
+    pub(crate) fn mem_usage(&self) -> usize {
+        self.memory_consumption
+    }
 }

 #[cfg(test)]
--- a/columnar/src/dynamic_column.rs
+++ b/columnar/src/dynamic_column.rs
@@ -1,14 +1,14 @@
-use std::io;
 use std::net::Ipv6Addr;
 use std::sync::Arc;
+use std::{fmt, io};

 use common::file_slice::FileSlice;
-use common::{HasLen, OwnedBytes};
+use common::{ByteCount, DateTime, HasLen, OwnedBytes};

 use crate::column::{BytesColumn, Column, StrColumn};
 use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
 use crate::columnar::ColumnType;
-use crate::{DateTime, NumericalType};
+use crate::{Cardinality, ColumnIndex, NumericalType};

 #[derive(Clone)]
 pub enum DynamicColumn {
@@ -22,7 +22,54 @@ pub enum DynamicColumn {
    Str(StrColumn),
 }

+impl fmt::Debug for DynamicColumn {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "[{} {} |", self.get_cardinality(), self.column_type())?;
+        match self {
+            DynamicColumn::Bool(col) => write!(f, " {col:?}")?,
+            DynamicColumn::I64(col) => write!(f, " {col:?}")?,
+            DynamicColumn::U64(col) => write!(f, " {col:?}")?,
+            DynamicColumn::F64(col) => write!(f, "{col:?}")?,
+            DynamicColumn::IpAddr(col) => write!(f, "{col:?}")?,
+            DynamicColumn::DateTime(col) => write!(f, "{col:?}")?,
+            DynamicColumn::Bytes(col) => write!(f, "{col:?}")?,
+            DynamicColumn::Str(col) => write!(f, "{col:?}")?,
+        }
+        write!(f, "]")
+    }
+}
+
 impl DynamicColumn {
+    pub fn column_index(&self) -> &ColumnIndex {
+        match self {
+            DynamicColumn::Bool(c) => &c.index,
+            DynamicColumn::I64(c) => &c.index,
+            DynamicColumn::U64(c) => &c.index,
+            DynamicColumn::F64(c) => &c.index,
+            DynamicColumn::IpAddr(c) => &c.index,
+            DynamicColumn::DateTime(c) => &c.index,
+            DynamicColumn::Bytes(c) => &c.ords().index,
+            DynamicColumn::Str(c) => &c.ords().index,
+        }
+    }
+
+    pub fn get_cardinality(&self) -> Cardinality {
+        self.column_index().get_cardinality()
+    }
+
+    pub fn num_values(&self) -> u32 {
+        match self {
+            DynamicColumn::Bool(c) => c.values.num_vals(),
+            DynamicColumn::I64(c) => c.values.num_vals(),
+            DynamicColumn::U64(c) => c.values.num_vals(),
+            DynamicColumn::F64(c) => c.values.num_vals(),
+            DynamicColumn::IpAddr(c) => c.values.num_vals(),
+            DynamicColumn::DateTime(c) => c.values.num_vals(),
+            DynamicColumn::Bytes(c) => c.ords().values.num_vals(),
+            DynamicColumn::Str(c) => c.ords().values.num_vals(),
+        }
+    }
+
    pub fn column_type(&self) -> ColumnType {
        match self {
            DynamicColumn::Bool(_) => ColumnType::Bool,
@@ -36,6 +83,14 @@ impl DynamicColumn {
        }
    }

+    pub fn coerce_numerical(self, target_numerical_type: NumericalType) -> Option<Self> {
+        match target_numerical_type {
+            NumericalType::I64 => self.coerce_to_i64(),
+            NumericalType::U64 => self.coerce_to_u64(),
+            NumericalType::F64 => self.coerce_to_f64(),
+        }
+    }
+
    pub fn is_numerical(&self) -> bool {
        self.column_type().numerical_type().is_some()
    }
@@ -50,28 +105,28 @@ impl DynamicColumn {
        self.column_type().numerical_type() == Some(NumericalType::U64)
    }

-    pub fn coerce_to_f64(self) -> Option<DynamicColumn> {
+    fn coerce_to_f64(self) -> Option<DynamicColumn> {
        match self {
            DynamicColumn::I64(column) => Some(DynamicColumn::F64(Column {
-                idx: column.idx,
+                index: column.index,
                values: Arc::new(monotonic_map_column(column.values, MapI64ToF64)),
            })),
            DynamicColumn::U64(column) => Some(DynamicColumn::F64(Column {
-                idx: column.idx,
+                index: column.index,
                values: Arc::new(monotonic_map_column(column.values, MapU64ToF64)),
            })),
            DynamicColumn::F64(_) => Some(self),
            _ => None,
        }
    }
-    pub fn coerce_to_i64(self) -> Option<DynamicColumn> {
+    fn coerce_to_i64(self) -> Option<DynamicColumn> {
        match self {
            DynamicColumn::U64(column) => {
                if column.max_value() > i64::MAX as u64 {
                    return None;
                }
                Some(DynamicColumn::I64(Column {
-                    idx: column.idx,
+                    index: column.index,
                    values: Arc::new(monotonic_map_column(column.values, MapU64ToI64)),
                }))
            }
@@ -79,14 +134,14 @@ impl DynamicColumn {
            _ => None,
        }
    }
-    pub fn coerce_to_u64(self) -> Option<DynamicColumn> {
+    fn coerce_to_u64(self) -> Option<DynamicColumn> {
        match self {
            DynamicColumn::I64(column) => {
                if column.min_value() < 0 {
                    return None;
                }
                Some(DynamicColumn::U64(Column {
-                    idx: column.idx,
+                    index: column.index,
                    values: Arc::new(monotonic_map_column(column.values, MapI64ToU64)),
                }))
            }
@@ -146,9 +201,9 @@ impl StrictlyMonotonicFn<i64, u64> for MapI64ToU64 {

 macro_rules! static_dynamic_conversions {
    ($typ:ty, $enum_name:ident) => {
-        impl Into<Option<$typ>> for DynamicColumn {
-            fn into(self) -> Option<$typ> {
-                if let DynamicColumn::$enum_name(col) = self {
+        impl From<DynamicColumn> for Option<$typ> {
+            fn from(dynamic_column: DynamicColumn) -> Option<$typ> {
+                if let DynamicColumn::$enum_name(col) = dynamic_column {
                    Some(col)
                } else {
                    None
@@ -168,12 +223,12 @@ static_dynamic_conversions!(Column<bool>, Bool);
 static_dynamic_conversions!(Column<u64>, U64);
 static_dynamic_conversions!(Column<i64>, I64);
 static_dynamic_conversions!(Column<f64>, F64);
-static_dynamic_conversions!(Column<crate::DateTime>, DateTime);
+static_dynamic_conversions!(Column<DateTime>, DateTime);
 static_dynamic_conversions!(StrColumn, Str);
 static_dynamic_conversions!(BytesColumn, Bytes);
 static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);

-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct DynamicColumnHandle {
    pub(crate) file_slice: FileSlice,
    pub(crate) column_type: ColumnType,
@@ -186,14 +241,13 @@ impl DynamicColumnHandle {
        self.open_internal(column_bytes)
    }

-    // TODO rename load_async
-    pub async fn open_async(&self) -> io::Result<DynamicColumn> {
-        let column_bytes: OwnedBytes = self.file_slice.read_bytes_async().await?;
-        self.open_internal(column_bytes)
+    #[doc(hidden)]
+    pub fn file_slice(&self) -> &FileSlice {
+        &self.file_slice
    }

    /// Returns the `u64` fast field reader reader associated with `fields` of types
-    /// Str, u64, i64, f64, or datetime.
+    /// Str, u64, i64, f64, bool, or datetime.
    ///
    /// If not, the fastfield reader will returns the u64-value associated with the original
    /// FastValue.
@@ -204,9 +258,12 @@ impl DynamicColumnHandle {
                let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
                Ok(Some(column.term_ord_column))
            }
-            ColumnType::Bool => Ok(None),
            ColumnType::IpAddr => Ok(None),
-            ColumnType::I64 | ColumnType::U64 | ColumnType::F64 | ColumnType::DateTime => {
+            ColumnType::Bool
+            | ColumnType::I64
+            | ColumnType::U64
+            | ColumnType::F64
+            | ColumnType::DateTime => {
                let column = crate::column::open_column_u64::<u64>(column_bytes)?;
                Ok(Some(column))
            }
@@ -215,24 +272,22 @@ impl DynamicColumnHandle {

    fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
        let dynamic_column: DynamicColumn = match self.column_type {
-            ColumnType::Bytes => {
-                crate::column::open_column_bytes::<BytesColumn>(column_bytes)?.into()
-            }
-            ColumnType::Str => crate::column::open_column_bytes::<StrColumn>(column_bytes)?.into(),
+            ColumnType::Bytes => crate::column::open_column_bytes(column_bytes)?.into(),
+            ColumnType::Str => crate::column::open_column_str(column_bytes)?.into(),
            ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
            ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
            ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
            ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
            ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
            ColumnType::DateTime => {
-                crate::column::open_column_u64::<crate::DateTime>(column_bytes)?.into()
+                crate::column::open_column_u64::<DateTime>(column_bytes)?.into()
            }
        };
        Ok(dynamic_column)
    }

-    pub fn num_bytes(&self) -> usize {
-        self.file_slice.len()
+    pub fn num_bytes(&self) -> ByteCount {
+        self.file_slice.len().into()
    }

    pub fn column_type(&self) -> ColumnType {
--- a/columnar/src/iterable.rs
+++ b/columnar/src/iterable.rs
@@ -0,0 +1,19 @@
+use std::ops::Range;
+
+pub trait Iterable<T = u64> {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
+}
+
+impl<'a, T: Copy> Iterable<T> for &'a [T] {
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
+        Box::new(self.iter().copied())
+    }
+}
+
+impl<T: Copy> Iterable<T> for Range<T>
+where Range<T>: Iterator<Item = T>
+{
+    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
+        Box::new(self.clone())
+    }
+}
--- a/columnar/src/lib.rs
+++ b/columnar/src/lib.rs
@@ -7,34 +7,49 @@ extern crate more_asserts;
 #[cfg(all(test, feature = "unstable"))]
 extern crate test;

+use std::fmt::Display;
 use std::io;

+mod block_accessor;
 mod column;
 mod column_index;
-mod column_values;
+pub mod column_values;
 mod columnar;
 mod dictionary;
 mod dynamic_column;
+mod iterable;
 pub(crate) mod utils;
 mod value;

+pub use block_accessor::ColumnBlockAccessor;
 pub use column::{BytesColumn, Column, StrColumn};
-pub use column_values::ColumnValues;
+pub use column_index::ColumnIndex;
+pub use column_values::{
+    ColumnValues, EmptyColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
+};
 pub use columnar::{
    merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
-    MergeDocOrder,
+    MergeRowOrder, ShuffleMergeOrder, StackMergeOrder,
 };
+use sstable::VoidSSTable;
 pub use value::{NumericalType, NumericalValue};

 pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};

 pub type RowId = u32;
+pub type DocId = u32;

-#[derive(Clone, Copy, PartialOrd, PartialEq, Default, Debug)]
-pub struct DateTime {
-    pub timestamp_micros: i64,
+#[derive(Clone, Copy, Debug)]
+pub struct RowAddr {
+    pub segment_ord: u32,
+    pub row_id: RowId,
 }

+pub use sstable::Dictionary;
+pub type Streamer<'a> = sstable::Streamer<'a, VoidSSTable>;
+
+pub use common::DateTime;
+
 #[derive(Copy, Clone, Debug)]
 pub struct InvalidData;

@@ -61,11 +76,27 @@ pub enum Cardinality {
    Multivalued = 2,
 }

+impl Display for Cardinality {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let short_str = match self {
+            Cardinality::Full => "full",
+            Cardinality::Optional => "opt",
+            Cardinality::Multivalued => "mult",
+        };
+        write!(f, "{short_str}")
+    }
+}
+
 impl Cardinality {
+    pub fn is_optional(&self) -> bool {
+        matches!(self, Cardinality::Optional)
+    }
+    pub fn is_multivalue(&self) -> bool {
+        matches!(self, Cardinality::Multivalued)
+    }
    pub(crate) fn to_code(self) -> u8 {
        self as u8
    }
-
    pub(crate) fn try_from_code(code: u8) -> Result<Cardinality, InvalidData> {
        match code {
            0 => Ok(Cardinality::Full),
--- a/columnar/src/tests.rs
+++ b/columnar/src/tests.rs
@@ -1,10 +1,19 @@
+use std::collections::HashMap;
+use std::fmt::Debug;
 use std::net::Ipv6Addr;

+use common::DateTime;
+use proptest::prelude::*;
+use proptest::sample::subsequence;
+
 use crate::column_values::MonotonicallyMappableToU128;
-use crate::columnar::ColumnType;
+use crate::columnar::{ColumnType, ColumnTypeCategory};
 use crate::dynamic_column::{DynamicColumn, DynamicColumnHandle};
-use crate::value::NumericalValue;
-use crate::{Cardinality, ColumnarReader, ColumnarWriter};
+use crate::value::{Coerce, NumericalValue};
+use crate::{
+    BytesColumn, Cardinality, Column, ColumnarReader, ColumnarWriter, RowAddr, RowId,
+    ShuffleMergeOrder, StackMergeOrder,
+};

 #[test]
 fn test_dataframe_writer_str() {
@@ -12,12 +21,12 @@ fn test_dataframe_writer_str() {
    dataframe_writer.record_str(1u32, "my_string", "hello");
    dataframe_writer.record_str(3u32, "my_string", "helloeee");
    let mut buffer: Vec<u8> = Vec::new();
-    dataframe_writer.serialize(5, &mut buffer).unwrap();
+    dataframe_writer.serialize(5, None, &mut buffer).unwrap();
    let columnar = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
    assert_eq!(cols.len(), 1);
-    assert_eq!(cols[0].num_bytes(), 158);
+    assert_eq!(cols[0].num_bytes(), 87);
 }

 #[test]
@@ -26,12 +35,12 @@ fn test_dataframe_writer_bytes() {
    dataframe_writer.record_bytes(1u32, "my_string", b"hello");
    dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
    let mut buffer: Vec<u8> = Vec::new();
-    dataframe_writer.serialize(5, &mut buffer).unwrap();
+    dataframe_writer.serialize(5, None, &mut buffer).unwrap();
    let columnar = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
    assert_eq!(cols.len(), 1);
-    assert_eq!(cols[0].num_bytes(), 158);
+    assert_eq!(cols[0].num_bytes(), 87);
 }

 #[test]
@@ -40,7 +49,7 @@ fn test_dataframe_writer_bool() {
    dataframe_writer.record_bool(1u32, "bool.value", false);
    dataframe_writer.record_bool(3u32, "bool.value", true);
    let mut buffer: Vec<u8> = Vec::new();
-    dataframe_writer.serialize(5, &mut buffer).unwrap();
+    dataframe_writer.serialize(5, None, &mut buffer).unwrap();
    let columnar = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
@@ -48,7 +57,9 @@ fn test_dataframe_writer_bool() {
    assert_eq!(cols[0].num_bytes(), 22);
    assert_eq!(cols[0].column_type(), ColumnType::Bool);
    let dyn_bool_col = cols[0].open().unwrap();
-    let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
+    let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
+        panic!();
+    };
    let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
    assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
 }
@@ -63,19 +74,21 @@ fn test_dataframe_writer_u64_multivalued() {
    dataframe_writer.record_numerical(6u32, "divisor", 2u64);
    dataframe_writer.record_numerical(6u32, "divisor", 3u64);
    let mut buffer: Vec<u8> = Vec::new();
-    dataframe_writer.serialize(7, &mut buffer).unwrap();
+    dataframe_writer.serialize(7, None, &mut buffer).unwrap();
    let columnar = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
    assert_eq!(cols.len(), 1);
    assert_eq!(cols[0].num_bytes(), 29);
    let dyn_i64_col = cols[0].open().unwrap();
-    let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
+    let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
+        panic!();
+    };
    assert_eq!(
        divisor_col.get_cardinality(),
        crate::Cardinality::Multivalued
    );
-    assert_eq!(divisor_col.num_rows(), 7);
+    assert_eq!(divisor_col.num_docs(), 7);
 }

 #[test]
@@ -84,7 +97,7 @@ fn test_dataframe_writer_ip_addr() {
    dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
    dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
    let mut buffer: Vec<u8> = Vec::new();
-    dataframe_writer.serialize(5, &mut buffer).unwrap();
+    dataframe_writer.serialize(5, None, &mut buffer).unwrap();
    let columnar = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
@@ -92,7 +105,9 @@ fn test_dataframe_writer_ip_addr() {
    assert_eq!(cols[0].num_bytes(), 42);
    assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
    let dyn_bool_col = cols[0].open().unwrap();
-    let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
+    let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
+        panic!();
+    };
    let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
    assert_eq!(
        &vals,
@@ -113,7 +128,7 @@ fn test_dataframe_writer_numerical() {
    dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64));
    dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64));
    let mut buffer: Vec<u8> = Vec::new();
-    dataframe_writer.serialize(6, &mut buffer).unwrap();
+    dataframe_writer.serialize(6, None, &mut buffer).unwrap();
    let columnar = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("srical.value").unwrap();
@@ -125,8 +140,10 @@ fn test_dataframe_writer_numerical() {
    // - null footer 6 bytes
    assert_eq!(cols[0].num_bytes(), 33);
    let column = cols[0].open().unwrap();
-    let DynamicColumn::I64(column_i64) = column else { panic!(); };
-    assert_eq!(column_i64.idx.get_cardinality(), Cardinality::Optional);
+    let DynamicColumn::I64(column_i64) = column else {
+        panic!();
+    };
+    assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
    assert_eq!(column_i64.first(0), None);
    assert_eq!(column_i64.first(1), Some(12i64));
    assert_eq!(column_i64.first(2), Some(13i64));
@@ -136,6 +153,46 @@ fn test_dataframe_writer_numerical() {
    assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
 }

+#[test]
+fn test_dataframe_sort_by_full() {
+    let mut dataframe_writer = ColumnarWriter::default();
+    dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1));
+    dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
+    let data = dataframe_writer.sort_order("value", 2, false);
+    assert_eq!(data, vec![0, 1]);
+}
+
+#[test]
+fn test_dataframe_sort_by_opt() {
+    let mut dataframe_writer = ColumnarWriter::default();
+    dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3));
+    dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2));
+    let data = dataframe_writer.sort_order("value", 5, false);
+    // 0, 2, 4 is 0.0
+    assert_eq!(data, vec![0, 2, 4, 3, 1]);
+    let data = dataframe_writer.sort_order("value", 5, true);
+    assert_eq!(
+        data,
+        vec![4, 2, 0, 3, 1].into_iter().rev().collect::<Vec<_>>()
+    );
+}
+
+#[test]
+fn test_dataframe_sort_by_multi() {
+    let mut dataframe_writer = ColumnarWriter::default();
+    // valid for sort
+    dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
+    // those are ignored for sort
+    dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
+    dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
+    // valid for sort
+    dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3));
+    // ignored, would change sort order
+    dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1));
+    let data = dataframe_writer.sort_order("value", 4, false);
+    assert_eq!(data, vec![0, 2, 1, 3]);
+}
+
 #[test]
 fn test_dictionary_encoded_str() {
    let mut buffer = Vec::new();
@@ -144,12 +201,14 @@ fn test_dictionary_encoded_str() {
    columnar_writer.record_str(3, "my.column", "c");
    columnar_writer.record_str(3, "my.column2", "different_column!");
    columnar_writer.record_str(4, "my.column", "b");
-    columnar_writer.serialize(5, &mut buffer).unwrap();
+    columnar_writer.serialize(5, None, &mut buffer).unwrap();
    let columnar_reader = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar_reader.num_columns(), 2);
    let col_handles = columnar_reader.read_columns("my.column").unwrap();
    assert_eq!(col_handles.len(), 1);
-    let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else  { panic!(); };
+    let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
+        panic!();
+    };
    let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
    assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
    assert_eq!(str_col.num_rows(), 5);
@@ -176,12 +235,14 @@ fn test_dictionary_encoded_bytes() {
    columnar_writer.record_bytes(3, "my.column", b"c");
    columnar_writer.record_bytes(3, "my.column2", b"different_column!");
    columnar_writer.record_bytes(4, "my.column", b"b");
-    columnar_writer.serialize(5, &mut buffer).unwrap();
+    columnar_writer.serialize(5, None, &mut buffer).unwrap();
    let columnar_reader = ColumnarReader::open(buffer).unwrap();
    assert_eq!(columnar_reader.num_columns(), 2);
    let col_handles = columnar_reader.read_columns("my.column").unwrap();
    assert_eq!(col_handles.len(), 1);
-    let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else  { panic!(); };
+    let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
+        panic!();
+    };
    let index: Vec<Option<u64>> = (0..5)
        .map(|row_id| bytes_col.ords().first(row_id))
        .collect();
@@ -210,3 +271,675 @@ fn test_dictionary_encoded_bytes() {
        .unwrap();
    assert_eq!(term_buffer, b"b");
 }
+
+/// Strategy producing numerical values, biased towards a handful of boundary values.
+///
+/// Each fixed value (zero, the `u64`/`i64` extrema and one plain float) carries weight 3,
+/// while each of the fully random `f64`, `u64` and `i64` draws carries weight 1.
+fn num_strategy() -> impl Strategy<Value = NumericalValue> {
+    const FIXED_VALUE_WEIGHT: u32 = 3;
+    const RANDOM_VALUE_WEIGHT: u32 = 1;
+    prop_oneof![
+        FIXED_VALUE_WEIGHT => Just(NumericalValue::U64(0u64)),
+        FIXED_VALUE_WEIGHT => Just(NumericalValue::U64(u64::MAX)),
+        FIXED_VALUE_WEIGHT => Just(NumericalValue::I64(0i64)),
+        FIXED_VALUE_WEIGHT => Just(NumericalValue::I64(i64::MIN)),
+        FIXED_VALUE_WEIGHT => Just(NumericalValue::I64(i64::MAX)),
+        FIXED_VALUE_WEIGHT => Just(NumericalValue::F64(1.2f64)),
+        RANDOM_VALUE_WEIGHT => any::<f64>().prop_map(NumericalValue::from),
+        RANDOM_VALUE_WEIGHT => any::<u64>().prop_map(NumericalValue::from),
+        RANDOM_VALUE_WEIGHT => any::<i64>().prop_map(NumericalValue::from),
+    ]
+}
+
+/// A single value that the tests in this file record into a column.
+///
+/// The type is `Copy`, so generated rows can be cloned and reordered cheaply.
+#[derive(Clone, Copy, Debug)]
+enum ColumnValue {
+    /// A string value.
+    Str(&'static str),
+    /// A raw bytes value.
+    Bytes(&'static [u8]),
+    /// Any numerical value (`u64`, `i64` or `f64`).
+    Numerical(NumericalValue),
+    /// An IPv6 address.
+    IpAddr(Ipv6Addr),
+    /// A boolean.
+    Bool(bool),
+    /// A date-time.
+    DateTime(DateTime),
+}
+
+/// Wraps anything convertible into a `NumericalValue` as `ColumnValue::Numerical`.
+impl<T: Into<NumericalValue>> From<T> for ColumnValue {
+    fn from(numerical: T) -> Self {
+        Self::Numerical(numerical.into())
+    }
+}
+
+impl ColumnValue {
+    /// Returns the `ColumnTypeCategory` matching this value's variant.
+    pub(crate) fn column_type_category(&self) -> ColumnTypeCategory {
+        match *self {
+            Self::Str(_) => ColumnTypeCategory::Str,
+            Self::Bytes(_) => ColumnTypeCategory::Bytes,
+            Self::Numerical(_) => ColumnTypeCategory::Numerical,
+            Self::IpAddr(_) => ColumnTypeCategory::IpAddr,
+            Self::Bool(_) => ColumnTypeCategory::Bool,
+            Self::DateTime(_) => ColumnTypeCategory::DateTime,
+        }
+    }
+}
+
+/// Picks one of two column names, so that generated values frequently share a column.
+fn column_name_strategy() -> impl Strategy<Value = &'static str> {
+    let first_column = Just("c1");
+    let second_column = Just("c2");
+    prop_oneof![first_column, second_column]
+}
+
+/// Picks one of two short string values.
+fn string_strategy() -> impl Strategy<Value = &'static str> {
+    let first_value = Just("a");
+    let second_value = Just("b");
+    prop_oneof![first_value, second_value]
+}
+
+/// Picks one of two single-byte payloads (`[0]` or `[1]`).
+fn bytes_strategy() -> impl Strategy<Value = &'static [u8]> {
+    let zero_byte = Just(&[0u8][..]);
+    let one_byte = Just(&[1u8][..]);
+    prop_oneof![zero_byte, one_byte]
+}
+
+/// Strategy producing a random column value of any supported type.
+///
+/// Numerical values (weight 40) and strings (weight 10) dominate; bytes, IP addresses,
+/// booleans and date-times each have weight 1.
+fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
+    prop_oneof![
+        10 => string_strategy().prop_map(ColumnValue::Str),
+        1 => bytes_strategy().prop_map(ColumnValue::Bytes),
+        40 => num_strategy().prop_map(ColumnValue::Numerical),
+        // Only the last 16-bit segment of the address varies (1 or 2).
+        1 => (1u16..3u16).prop_map(|last_segment| {
+            ColumnValue::IpAddr(Ipv6Addr::new(127, 0, 0, 0, 0, 0, 0, last_segment))
+        }),
+        1 => any::<bool>().prop_map(ColumnValue::Bool),
+        1 => (679_723_993i64..1_679_723_995i64)
+            .prop_map(|secs| ColumnValue::DateTime(DateTime::from_timestamp_secs(secs)))
+    ]
+}
+
+/// Strategy producing one document: between 0 and 4 `(column name, value)` pairs.
+fn doc_strategy() -> impl Strategy<Value = Vec<(&'static str, ColumnValue)>> {
+    let named_value = (column_name_strategy(), column_value_strategy());
+    proptest::collection::vec(named_value, 0..=4)
+}
+
+/// Strategy producing the number of documents of a columnar, between 0 and 12.
+fn num_docs_strategy() -> impl Strategy<Value = usize> {
+    // Tiny columnars (0 to 3 docs) are assumed to be sufficient to cover all edge cases.
+    let tiny = 0usize..=3usize;
+    // The other alternative explores larger columnars (3 to 12 docs) more defensively.
+    let larger = 3usize..=12usize;
+    prop_oneof![tiny, larger]
+}
+
+/// Strategy producing the rows of one columnar: a vector of documents whose length is
+/// drawn from `num_docs_strategy` (so up to 12 docs).
+fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, ColumnValue)>>> {
+    let docs_of_len = |num_docs: usize| proptest::collection::vec(doc_strategy(), num_docs);
+    num_docs_strategy().prop_flat_map(docs_of_len)
+}
+
+/// Strategy producing the rows of one columnar together with a random permutation of
+/// their row ids.
+fn columnar_docs_and_mapping_strategy(
+) -> impl Strategy<Value = (Vec<Vec<(&'static str, ColumnValue)>>, Vec<RowId>)> {
+    columnar_docs_strategy().prop_flat_map(|docs| {
+        let num_docs = docs.len();
+        // `docs` is cloned for every permutation drawn: the mapping closure may run many times.
+        permutation_strategy(num_docs).prop_map(move |mapping| (docs.clone(), mapping))
+    })
+}
+
+/// Strategy producing a random permutation of the row ids `0..n`.
+fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
+    let identity: Vec<RowId> = (0u32..n as RowId).collect();
+    Just(identity).prop_shuffle()
+}
+
+/// Strategy producing a shuffled subset (of any size from 0 to `n`) of the indices `0..n`.
+fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
+    let all_indices: Vec<usize> = (0..n).collect();
+    let subset = subsequence(all_indices, 0..=n);
+    subset.prop_shuffle()
+}
+
+/// Records every value of `docs` into a fresh `ColumnarWriter`, serializes it (handing
+/// `old_to_new_row_ids_opt` over to `serialize`) and reopens the bytes as a `ColumnarReader`.
+///
+/// The position of a document in `docs` is the row id used while recording.
+fn build_columnar_with_mapping(
+    docs: &[Vec<(&'static str, ColumnValue)>],
+    old_to_new_row_ids_opt: Option<&[RowId]>,
+) -> ColumnarReader {
+    let num_docs = docs.len() as u32;
+    let mut serialized = Vec::new();
+    let mut writer = ColumnarWriter::default();
+    for (doc_id, doc) in docs.iter().enumerate() {
+        let row_id = doc_id as u32;
+        for &(column_name, value) in doc {
+            match value {
+                ColumnValue::Str(text) => {
+                    writer.record_str(row_id, column_name, text);
+                }
+                ColumnValue::Bytes(payload) => {
+                    writer.record_bytes(row_id, column_name, payload);
+                }
+                ColumnValue::Numerical(number) => {
+                    writer.record_numerical(row_id, column_name, number);
+                }
+                ColumnValue::IpAddr(ip_addr) => {
+                    writer.record_ip_addr(row_id, column_name, ip_addr);
+                }
+                ColumnValue::Bool(flag) => {
+                    writer.record_bool(row_id, column_name, flag);
+                }
+                ColumnValue::DateTime(date_time) => {
+                    writer.record_datetime(row_id, column_name, date_time);
+                }
+            }
+        }
+    }
+    writer
+        .serialize(num_docs, old_to_new_row_ids_opt, &mut serialized)
+        .unwrap();
+    ColumnarReader::open(serialized).unwrap()
+}
+
+/// Builds a columnar from `docs` without any row id mapping.
+fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
+    let no_mapping: Option<&[RowId]> = None;
+    build_columnar_with_mapping(docs, no_mapping)
+}
+
+/// Asserts that two columnars are equal, with no leniency on the numerical column types.
+fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
+    let lenient_on_numerical_value = false;
+    assert_columnar_eq(left, right, lenient_on_numerical_value);
+}
+
+/// Asserts that two columnars hold the same number of rows and the same columns, compared
+/// pairwise in the order returned by `list_columns`.
+///
+/// `lenient_on_numerical_value` is forwarded to `assert_dyn_column_eq`.
+fn assert_columnar_eq(
+    left: &ColumnarReader,
+    right: &ColumnarReader,
+    lenient_on_numerical_value: bool,
+) {
+    assert_eq!(left.num_rows(), right.num_rows());
+    let left_columns = left.list_columns().unwrap();
+    let right_columns = right.list_columns().unwrap();
+    assert_eq!(left_columns.len(), right_columns.len());
+    for ((left_name, left_handle), (right_name, right_handle)) in
+        left_columns.iter().zip(right_columns.iter())
+    {
+        assert_eq!(left_name, right_name);
+        let left_column = left_handle.open().unwrap();
+        let right_column = right_handle.open().unwrap();
+        assert_dyn_column_eq(&left_column, &right_column, lenient_on_numerical_value);
+    }
+}
+
+/// Asserts that two typed columns are identical: same cardinality, same number of docs,
+/// same value row ids for every doc and same values at every value index.
+fn assert_column_eq<T>(left: &Column<T>, right: &Column<T>)
+where T: Copy + PartialOrd + Debug + Send + Sync + 'static {
+    assert_eq!(left.get_cardinality(), right.get_cardinality());
+    assert_eq!(left.num_docs(), right.num_docs());
+    for row_id in 0..left.num_docs() {
+        let left_value_rows = left.index.value_row_ids(row_id);
+        let right_value_rows = right.index.value_row_ids(row_id);
+        assert_eq!(left_value_rows, right_value_rows);
+    }
+    assert_eq!(left.values.num_vals(), right.values.num_vals());
+    for value_idx in 0..left.values.num_vals() {
+        let left_value = left.values.get_val(value_idx);
+        let right_value = right.values.get_val(value_idx);
+        assert_eq!(left_value, right_value);
+    }
+}
+
+/// Asserts that two bytes columns are identical: same term ordinal column and the same
+/// dictionary terms, streamed in the same order.
+fn assert_bytes_column_eq(left: &BytesColumn, right: &BytesColumn) {
+    let left_ords = &left.term_ord_column;
+    let right_ords = &right.term_ord_column;
+    assert_eq!(left_ords.get_cardinality(), right_ords.get_cardinality());
+    assert_eq!(left.num_rows(), right.num_rows());
+    assert_column_eq(left_ords, right_ords);
+    assert_eq!(left.dictionary.num_terms(), right.dictionary.num_terms());
+    let mut left_stream = left.dictionary.stream().unwrap();
+    let mut right_stream = right.dictionary.stream().unwrap();
+    for _ in 0..left.dictionary.num_terms() {
+        assert!(left_stream.advance());
+        assert!(right_stream.advance());
+        assert_eq!(left_stream.key(), right_stream.key());
+    }
+    // Both streams must be exhausted after `num_terms` terms.
+    assert!(!left_stream.advance());
+    assert!(!right_stream.advance());
+}
+
+/// Asserts that two dynamic columns are equal.
+///
+/// Columns of the same type are compared in full. When the types differ, the function
+/// panics, unless `lenient_on_numerical_value` is set: in that case it only requires both
+/// column types to map to the same `ColumnTypeCategory`, and the values are not compared.
+fn assert_dyn_column_eq(
+    left_dyn_column: &DynamicColumn,
+    right_dyn_column: &DynamicColumn,
+    lenient_on_numerical_value: bool,
+) {
+    assert_eq!(
+        &left_dyn_column.get_cardinality(),
+        &right_dyn_column.get_cardinality()
+    );
+    match (left_dyn_column, right_dyn_column) {
+        (DynamicColumn::Bool(lhs), DynamicColumn::Bool(rhs)) => assert_column_eq(lhs, rhs),
+        (DynamicColumn::I64(lhs), DynamicColumn::I64(rhs)) => assert_column_eq(lhs, rhs),
+        (DynamicColumn::U64(lhs), DynamicColumn::U64(rhs)) => assert_column_eq(lhs, rhs),
+        (DynamicColumn::F64(lhs), DynamicColumn::F64(rhs)) => assert_column_eq(lhs, rhs),
+        (DynamicColumn::DateTime(lhs), DynamicColumn::DateTime(rhs)) => {
+            assert_column_eq(lhs, rhs)
+        }
+        (DynamicColumn::IpAddr(lhs), DynamicColumn::IpAddr(rhs)) => assert_column_eq(lhs, rhs),
+        (DynamicColumn::Bytes(lhs), DynamicColumn::Bytes(rhs)) => {
+            assert_bytes_column_eq(lhs, rhs)
+        }
+        (DynamicColumn::Str(lhs), DynamicColumn::Str(rhs)) => assert_bytes_column_eq(lhs, rhs),
+        // Only pairs of differing column types reach this arm.
+        (lhs, rhs) => {
+            if !lenient_on_numerical_value {
+                panic!(
+                    "Column type are not the same: {:?} vs {:?}",
+                    lhs.column_type(),
+                    rhs.column_type()
+                );
+            }
+            assert_eq!(
+                ColumnTypeCategory::from(lhs.column_type()),
+                ColumnTypeCategory::from(rhs.column_type())
+            );
+        }
+    }
+}
+
+/// Implemented by the value types read back from a column, so that they can be checked
+/// against the `ColumnValue` that was originally recorded.
+trait AssertEqualToColumnValue {
+    /// Panics if `column_value` is not of the matching variant or does not equal `self`.
+    fn assert_equal_to_column_value(&self, column_value: &ColumnValue);
+}
+
+impl AssertEqualToColumnValue for bool {
+    /// Expects a `ColumnValue::Bool` equal to `self`; panics on any other variant.
+    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
+        match column_value {
+            ColumnValue::Bool(expected) => assert_eq!(self, expected),
+            _ => panic!(),
+        }
+    }
+}
+
+impl AssertEqualToColumnValue for Ipv6Addr {
+    /// Expects a `ColumnValue::IpAddr` equal to `self`; panics on any other variant.
+    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
+        match column_value {
+            ColumnValue::IpAddr(expected) => assert_eq!(self, expected),
+            _ => panic!(),
+        }
+    }
+}
+
+impl<T> AssertEqualToColumnValue for T
+where T: Coerce + PartialEq + Debug + Into<NumericalValue>
+{
+    /// Expects a `ColumnValue::Numerical` whose value, once coerced to `T`, equals `self`;
+    /// panics on any other variant.
+    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
+        match column_value {
+            ColumnValue::Numerical(recorded) => {
+                let coerced = T::coerce(*recorded);
+                assert_eq!(self, &coerced);
+            }
+            _ => panic!(),
+        }
+    }
+}
+
+impl AssertEqualToColumnValue for DateTime {
+    /// Expects a `ColumnValue::DateTime` equal to `self`; panics on any other variant.
+    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
+        match column_value {
+            ColumnValue::DateTime(expected) => assert_eq!(self, expected),
+            _ => panic!(),
+        }
+    }
+}
+
+/// Checks the content of a typed column against the values expected for each row.
+///
+/// `expected` maps a row id to the values recorded for that row, in recording order.
+/// Rows without any value must be absent from `expected`: the number of non-empty rows
+/// of the column has to match the number of entries of the map.
+fn assert_column_values<
+    T: AssertEqualToColumnValue + PartialEq + Copy + PartialOrd + Debug + Send + Sync + 'static,
+>(
+    col: &Column<T>,
+    expected: &HashMap<u32, Vec<&ColumnValue>>,
+) {
+    let mut non_empty_row_count = 0;
+    for row_id in 0..col.num_docs() {
+        let actual_values: Vec<T> = col.values_for_doc(row_id).collect();
+        if !actual_values.is_empty() {
+            non_empty_row_count += 1;
+            let wanted_values = expected.get(&row_id).unwrap();
+            assert_eq!(actual_values.len(), wanted_values.len());
+            for (actual, wanted) in actual_values.iter().zip(wanted_values.iter().copied()) {
+                actual.assert_equal_to_column_value(wanted);
+            }
+        }
+    }
+    assert_eq!(non_empty_row_count, expected.len());
+}
+
+/// Checks the content of a bytes/str column against the values expected for each row.
+///
+/// Every term ordinal of a row is resolved through `ord_to_bytes` and compared with the
+/// expected value. `is_str` states whether the expected values must be `ColumnValue::Str`
+/// (`true`) or `ColumnValue::Bytes` (`false`); any other variant panics.
+fn assert_bytes_column_values(
+    col: &BytesColumn,
+    expected: &HashMap<u32, Vec<&ColumnValue>>,
+    is_str: bool,
+) {
+    let mut non_empty_row_count = 0;
+    let mut term_buffer = Vec::new();
+    for row_id in 0..col.term_ord_column.num_docs() {
+        let term_ords: Vec<u64> = col.term_ords(row_id).collect();
+        if !term_ords.is_empty() {
+            let wanted_values = expected.get(&row_id).unwrap();
+            assert_eq!(term_ords.len(), wanted_values.len());
+            for (wanted, term_ord) in wanted_values.iter().copied().zip(term_ords.iter().copied())
+            {
+                col.ord_to_bytes(term_ord, &mut term_buffer).unwrap();
+                match wanted {
+                    ColumnValue::Str(text) => {
+                        assert!(is_str);
+                        assert_eq!(text.as_bytes(), &term_buffer);
+                    }
+                    ColumnValue::Bytes(payload) => {
+                        assert!(!is_str);
+                        assert_eq!(payload, &term_buffer);
+                    }
+                    _ => {
+                        panic!();
+                    }
+                }
+            }
+            non_empty_row_count += 1;
+        }
+    }
+    assert_eq!(non_empty_row_count, expected.len());
+}
+
+// Property test: builds a small columnar from random rows (see `columnar_docs_strategy`)
+// and checks that every column read back holds exactly the values recorded for each row.
+proptest! {
+    #![proptest_config(ProptestConfig::with_cases(500))]
+    #[test]
+    fn test_single_columnar_builder_proptest(docs in columnar_docs_strategy()) {
+        let columnar = build_columnar(&docs[..]);
+        assert_eq!(columnar.num_rows() as usize, docs.len());
+        // Expected content, grouped by (column name, type category), then by row id.
+        let mut expected_columns: HashMap<
+            (&str, ColumnTypeCategory),
+            HashMap<u32, Vec<&ColumnValue>>,
+        > = HashMap::default();
+        for (row_id, row) in docs.iter().enumerate() {
+            for (name, value) in row {
+                let column_key = (*name, value.column_type_category());
+                let rows_of_column = expected_columns.entry(column_key).or_default();
+                rows_of_column.entry(row_id as u32).or_default().push(value);
+            }
+        }
+        let column_list = columnar.list_columns().unwrap();
+        assert_eq!(expected_columns.len(), column_list.len());
+        for (column_name, handle) in column_list {
+            let dynamic_column = handle.open().unwrap();
+            let category = ColumnTypeCategory::from(dynamic_column.column_type());
+            let expected_rows: &HashMap<u32, Vec<&ColumnValue>> = expected_columns
+                .get(&(column_name.as_str(), category))
+                .unwrap();
+            match &dynamic_column {
+                DynamicColumn::Bool(col) => assert_column_values(col, expected_rows),
+                DynamicColumn::I64(col) => assert_column_values(col, expected_rows),
+                DynamicColumn::U64(col) => assert_column_values(col, expected_rows),
+                DynamicColumn::F64(col) => assert_column_values(col, expected_rows),
+                DynamicColumn::IpAddr(col) => assert_column_values(col, expected_rows),
+                DynamicColumn::DateTime(col) => assert_column_values(col, expected_rows),
+                DynamicColumn::Bytes(col) => assert_bytes_column_values(col, expected_rows, false),
+                DynamicColumn::Str(col) => assert_bytes_column_values(col, expected_rows, true),
+            }
+        }
+    }
+}
+
+// Same as `test_single_columnar_builder_proptest`, but the columnar is serialized with a
+// random row id mapping: the values recorded for doc `doc_id` are expected at row
+// `mapping[doc_id]` of the resulting columnar.
+proptest! {
+    #![proptest_config(ProptestConfig::with_cases(500))]
+    #[test]
+    fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) {
+        let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping));
+        assert_eq!(columnar.num_rows() as usize, docs.len());
+        // Expected content, grouped by (column name, type category), then by *new* row id.
+        let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
+        for (doc_id, doc_vals) in docs.iter().enumerate() {
+            for (col_name, col_val) in doc_vals {
+                expected_columns
+                    .entry((col_name, col_val.column_type_category()))
+                    .or_default()
+                    .entry(mapping[doc_id])
+                    .or_default()
+                    .push(col_val);
+            }
+        }
+        let column_list = columnar.list_columns().unwrap();
+        assert_eq!(expected_columns.len(), column_list.len());
+        for (column_name, column) in column_list {
+            let dynamic_column = column.open().unwrap();
+            let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
+            let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
+            // The helpers below already walk every row of the column, so a single call per
+            // column is enough; repeating it once per row only re-ran the same assertions.
+            match &dynamic_column {
+                DynamicColumn::Bool(col) =>
+                    assert_column_values(col, expected_col_values),
+                DynamicColumn::I64(col) =>
+                    assert_column_values(col, expected_col_values),
+                DynamicColumn::U64(col) =>
+                    assert_column_values(col, expected_col_values),
+                DynamicColumn::F64(col) =>
+                    assert_column_values(col, expected_col_values),
+                DynamicColumn::IpAddr(col) =>
+                    assert_column_values(col, expected_col_values),
+                DynamicColumn::DateTime(col) =>
+                    assert_column_values(col, expected_col_values),
+                DynamicColumn::Bytes(col) =>
+                    assert_bytes_column_values(col, expected_col_values, false),
+                DynamicColumn::Str(col) =>
+                    assert_bytes_column_values(col, expected_col_values, true),
+            }
+        }
+    }
+}
+
+// This test creates 2 or 3 small random columnars and stack-merges them.
+// The merged columnar is then compared (strictly) with the columnar obtained by building
+// directly from the concatenation of all the rows.
+proptest! {
+    #![proptest_config(ProptestConfig::with_cases(1000))]
+    #[test]
+    fn test_columnar_merge_proptest(columnar_docs in proptest::collection::vec(columnar_docs_strategy(), 2..=3)) {
+        let mut readers: Vec<ColumnarReader> = Vec::with_capacity(columnar_docs.len());
+        for docs in columnar_docs.iter() {
+            readers.push(build_columnar(&docs[..]));
+        }
+        let reader_refs: Vec<&ColumnarReader> = readers.iter().collect();
+        let mut merged_bytes: Vec<u8> = Vec::new();
+        let merge_row_order: crate::MergeRowOrder =
+            StackMergeOrder::stack(&reader_refs[..]).into();
+        crate::merge_columnar(&reader_refs[..], &[], merge_row_order, &mut merged_bytes).unwrap();
+        let merged_columnar = ColumnarReader::open(merged_bytes).unwrap();
+        let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
+            columnar_docs.iter().flatten().cloned().collect();
+        let expected_merged_columnar = build_columnar(&concat_rows[..]);
+        assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
+    }
+}
+
+/// Stack-merging an empty columnar with a one-row columnar must give the same result as
+/// building a columnar from the concatenated rows.
+#[test]
+fn test_columnar_merging_empty_columnar() {
+    let empty_columnar: Vec<Vec<(&str, ColumnValue)>> = Vec::new();
+    let single_row_columnar = vec![vec![("c1", ColumnValue::Str("a"))]];
+    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> =
+        vec![empty_columnar, single_row_columnar];
+    let mut readers: Vec<ColumnarReader> = Vec::with_capacity(columnar_docs.len());
+    for docs in columnar_docs.iter() {
+        readers.push(build_columnar(&docs[..]));
+    }
+    let reader_refs: Vec<&ColumnarReader> = readers.iter().collect();
+    let mut merged_bytes: Vec<u8> = Vec::new();
+    let merge_row_order = crate::MergeRowOrder::Stack(StackMergeOrder::stack(&reader_refs[..]));
+    crate::merge_columnar(&reader_refs[..], &[], merge_row_order, &mut merged_bytes).unwrap();
+    let merged_columnar = ColumnarReader::open(merged_bytes).unwrap();
+    let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
+        columnar_docs.iter().flatten().cloned().collect();
+    let expected_merged_columnar = build_columnar(&concat_rows[..]);
+    assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
+}
+
+/// Stack-merges a columnar holding an `i64` value in column `c2` with a columnar holding
+/// `u64` values (including `u64::MAX`) in the same column, and checks that the result
+/// equals the columnar built from the concatenated rows.
+#[test]
+fn test_columnar_merging_number_columns() {
+    // First columnar: a single doc.
+    let first_columnar = vec![vec![("c2", ColumnValue::Numerical(0i64.into()))]];
+    // Second columnar: two docs.
+    let second_columnar = vec![
+        vec![("c2", ColumnValue::Numerical(0u64.into()))],
+        vec![("c2", ColumnValue::Numerical(u64::MAX.into()))],
+    ];
+    let columnar_docs: Vec<Vec<Vec<(&str, ColumnValue)>>> = vec![first_columnar, second_columnar];
+    let mut readers: Vec<ColumnarReader> = Vec::with_capacity(columnar_docs.len());
+    for docs in columnar_docs.iter() {
+        readers.push(build_columnar(&docs[..]));
+    }
+    let reader_refs: Vec<&ColumnarReader> = readers.iter().collect();
+    let mut merged_bytes: Vec<u8> = Vec::new();
+    let merge_row_order = crate::MergeRowOrder::Stack(StackMergeOrder::stack(&reader_refs[..]));
+    crate::merge_columnar(&reader_refs[..], &[], merge_row_order, &mut merged_bytes).unwrap();
+    let merged_columnar = ColumnarReader::open(merged_bytes).unwrap();
+    let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
+        columnar_docs.iter().flatten().cloned().collect();
+    let expected_merged_columnar = build_columnar(&concat_rows[..]);
+    assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
+}
+
+// TODO add non trivial remap and merge
+// TODO test required_columns
+// TODO document edge case: required_columns incompatible with values.
+
+/// Strategy producing the rows of 2 or 3 columnars, together with a shuffled subset of
+/// the addresses (`segment_ord`, `row_id`) of all their rows.
+fn columnar_docs_and_remap(
+) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
+    proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
+        |columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| {
+            // Enumerate the address of every row, segment after segment.
+            let mut all_row_addrs: Vec<RowAddr> = Vec::new();
+            for (segment_ord, segment_docs) in columnars_docs.iter().enumerate() {
+                for row_id in 0u32..segment_docs.len() as u32 {
+                    all_row_addrs.push(RowAddr {
+                        segment_ord: segment_ord as u32,
+                        row_id,
+                    });
+                }
+            }
+            permutation_and_subset_strategy(all_row_addrs.len()).prop_map(move |picked| {
+                let picked_row_addrs: Vec<RowAddr> =
+                    picked.iter().map(|&ord| all_row_addrs[ord]).collect();
+                (columnars_docs.clone(), picked_row_addrs)
+            })
+        },
+    )
+}
+
+// Merges 2 or 3 random columnars following a shuffled subset of their rows, and compares
+// the result with the columnar built directly from the rows in that order. The comparison
+// is lenient on the numerical column types.
+proptest! {
+    #![proptest_config(ProptestConfig::with_cases(1000))]
+    #[test]
+    fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
+        let mut shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> =
+            Vec::with_capacity(shuffle_merge_order.len());
+        for row_addr in shuffle_merge_order.iter() {
+            let segment_docs = &columnar_docs[row_addr.segment_ord as usize];
+            shuffled_rows.push(segment_docs[row_addr.row_id as usize].clone());
+        }
+        let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
+        let mut readers: Vec<ColumnarReader> = Vec::with_capacity(columnar_docs.len());
+        let mut segment_num_rows: Vec<RowId> = Vec::with_capacity(columnar_docs.len());
+        for docs in columnar_docs.iter() {
+            readers.push(build_columnar(&docs[..]));
+            segment_num_rows.push(docs.len() as RowId);
+        }
+        let reader_refs: Vec<&ColumnarReader> = readers.iter().collect();
+        let mut merged_bytes: Vec<u8> = Vec::new();
+        let merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
+        crate::merge_columnar(&reader_refs[..], &[], merge_order.into(), &mut merged_bytes).unwrap();
+        let merged_columnar = ColumnarReader::open(merged_bytes).unwrap();
+        assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
+    }
+}
+
+/// A shuffle merge that keeps no row must produce a columnar with no rows and no columns.
+#[test]
+fn test_columnar_merge_empty() {
+    let empty_reader = build_columnar(&[]);
+    let single_row: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
+    let single_row_reader = build_columnar(single_row);
+    let readers = [&empty_reader, &single_row_reader];
+    // NOTE(review): the second columnar holds one row but is declared with 0 rows here;
+    // confirm this is what `ShuffleMergeOrder::for_test` expects.
+    let segment_num_rows: Vec<RowId> = vec![0, 0];
+    let merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, vec![]);
+    let mut merged_bytes: Vec<u8> = Vec::new();
+    crate::merge_columnar(&readers, &[], merge_order.into(), &mut merged_bytes).unwrap();
+    let merged_columnar = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged_columnar.num_rows(), 0);
+    assert_eq!(merged_columnar.num_columns(), 0);
+}
+
+/// Shuffle-merges an empty columnar with a one-row columnar, keeping that single row:
+/// the result must hold one row and one column.
+#[test]
+fn test_columnar_merge_single_str_column() {
+    let empty_reader = build_columnar(&[]);
+    let single_row: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
+    let single_row_reader = build_columnar(single_row);
+    let readers = [&empty_reader, &single_row_reader];
+    let segment_num_rows: Vec<RowId> = vec![0, 1];
+    // Keep only the first row of the second columnar.
+    let kept_row = RowAddr {
+        segment_ord: 1u32,
+        row_id: 0u32,
+    };
+    let merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, vec![kept_row]);
+    let mut merged_bytes: Vec<u8> = Vec::new();
+    crate::merge_columnar(&readers, &[], merge_order.into(), &mut merged_bytes).unwrap();
+    let merged_columnar = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged_columnar.num_rows(), 1);
+    assert_eq!(merged_columnar.num_columns(), 1);
+}
+
+/// Column `c` holds two values in the first row of the source columnar and one value in
+/// the second. After a merge that only keeps the second row, the merged `c` column must
+/// be an `I64` column with `Cardinality::Full`.
+#[test]
+fn test_delete_decrease_cardinality() {
+    let empty_reader = build_columnar(&[]);
+    let two_valued_row = vec![
+        ("c", ColumnValue::from(0i64)),
+        ("c", ColumnValue::from(0i64)),
+    ];
+    let single_valued_row = vec![("c", ColumnValue::from(0i64))];
+    let rows: &[Vec<_>] = &[two_valued_row, single_valued_row][..];
+    // `c` receives two values for the first row of this columnar.
+    let source_reader = build_columnar(rows);
+    let readers = [&empty_reader, &source_reader];
+    // Keep only the second row of the second columnar.
+    let kept_row = RowAddr {
+        segment_ord: 1u32,
+        row_id: 1u32,
+    };
+    let merge_order = ShuffleMergeOrder::for_test(&[0, 2], vec![kept_row]);
+    let mut merged_bytes: Vec<u8> = Vec::new();
+    crate::merge_columnar(&readers, &[], merge_order.into(), &mut merged_bytes).unwrap();
+    let merged_columnar = ColumnarReader::open(merged_bytes).unwrap();
+    assert_eq!(merged_columnar.num_rows(), 1);
+    assert_eq!(merged_columnar.num_columns(), 1);
+    let columns = merged_columnar.read_columns("c").unwrap();
+    assert_eq!(columns.len(), 1);
+    let merged_column = &columns[0];
+    assert_eq!(merged_column.column_type(), ColumnType::I64);
+    assert_eq!(merged_column.open().unwrap().get_cardinality(), Cardinality::Full);
+}
--- a/Show More
+++ b/Show More