Opportunistically seed forked block caches from current one.

Do not expose StoreReader::fork_cache and ::cache_key in the public API if only Searcher::docs_async uses them.
Add Searcher::docs_async which efficiently fetches multiple documents by grouping them by segment and block.
2026-01-10 11:02:55 +00:00 · 2023-12-11 15:24:03 +01:00 · 2023-12-11 15:24:03 +01:00 · 2023-12-11 15:24:03 +01:00 · 2023-12-11 11:01:17 +01:00 · 2023-12-11 10:28:50 +01:00
61 changed files with 3133 additions and 1298 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.21.0"
+version = "0.22.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -22,7 +22,7 @@ crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
 aho-corasick = "1.0"
-tantivy-fst = "0.4.0"
+tantivy-fst = "0.5"
 memmap2 = { version = "0.9.0", optional = true }
 lz4_flex = { version = "0.11", default-features = false, optional = true }
 zstd = { version = "0.13", optional = true, default-features = false }
@@ -37,21 +37,19 @@ uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
 rust-stemmers = "1.2.0"
 downcast-rs = "1.2.0"
-bitpacking = { git = "https://github.com/quickwit-oss/bitpacking", rev = "f730b75", default-features = false, features = ["bitpacker4x"] }
+bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker4x"] }
 census = "0.4.0"
 rustc-hash = "1.1.0"
 thiserror = "1.0.30"
 htmlescape = "0.3.1"
 fail = { version = "0.5.0", optional = true }
 murmurhash32 = "0.3.0"
 time = { version = "0.3.10", features = ["serde-well-known"] }
 smallvec = "1.8.0"
 rayon = "1.5.2"
 lru = "0.12.0"
 fastdivide = "0.4.0"
-itertools = "0.11.0"
+itertools = "0.12.0"
 measure_time = "0.8.2"
 async-trait = "0.1.53"
 arc-swap = "1.5.0"
 columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
@@ -75,15 +73,13 @@ matches = "0.1.9"
 pretty_assertions = "1.2.1"
 proptest = "1.0.0"
 test-log = "0.2.10"
 env_logger = "0.10.0"
 futures = "0.3.21"
 paste = "1.0.11"
 more-asserts = "0.3.1"
 rand_distr = "0.4.3"
 [target.'cfg(not(windows))'.dev-dependencies]
-criterion = "0.5"
+criterion = { version = "0.5", default-features = false }
 pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
 [dev-dependencies.fail]
 version = "0.5.0"
--- a/benches/index-bench.rs
+++ b/benches/index-bench.rs
@@ -1,14 +1,99 @@
-use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion, Throughput};
 use pprof::criterion::{Output, PProfProfiler};
 use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT};
-use tantivy::{Index, IndexWriter};
+use tantivy::{tokenizer, Index, IndexWriter};
 const HDFS_LOGS: &str = include_str!("hdfs.json");
 const GH_LOGS: &str = include_str!("gh.json");
 const WIKI: &str = include_str!("wiki.json");
-fn get_lines(input: &str) -> Vec<&str> {
+fn benchmark(
-    input.trim().split('\n').collect()
+    b: &mut Bencher,
    input: &str,
    schema: tantivy::schema::Schema,
    commit: bool,
    parse_json: bool,
    is_dynamic: bool,
 ) {
    if is_dynamic {
        benchmark_dynamic_json(b, input, schema, commit, parse_json)
    } else {
        _benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
            TantivyDocument::parse_json(&schema, doc_json).unwrap()
        })
    }
 }
 fn get_index(schema: tantivy::schema::Schema) -> Index {
    let mut index = Index::create_in_ram(schema.clone());
    let ff_tokenizer_manager = tokenizer::TokenizerManager::default();
    ff_tokenizer_manager.register(
        "raw",
        tokenizer::TextAnalyzer::builder(tokenizer::RawTokenizer::default())
            .filter(tokenizer::RemoveLongFilter::limit(255))
            .build(),
    );
    index.set_fast_field_tokenizers(ff_tokenizer_manager.clone());
    index
 }
 fn _benchmark(
    b: &mut Bencher,
    input: &str,
    schema: tantivy::schema::Schema,
    commit: bool,
    include_json_parsing: bool,
    create_doc: impl Fn(&tantivy::schema::Schema, &str) -> TantivyDocument,
 ) {
    if include_json_parsing {
        let lines: Vec<&str> = input.trim().split('\n').collect();
        b.iter(|| {
            let index = get_index(schema.clone());
            let mut index_writer: IndexWriter =
                index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let doc = create_doc(&schema, doc_json);
                index_writer.add_document(doc).unwrap();
            }
            if commit {
                index_writer.commit().unwrap();
            }
        })
    } else {
        let docs: Vec<_> = input
            .trim()
            .split('\n')
            .map(|doc_json| create_doc(&schema, doc_json))
            .collect();
        b.iter_batched(
            || docs.clone(),
            |docs| {
                let index = get_index(schema.clone());
                let mut index_writer: IndexWriter =
                    index.writer_with_num_threads(1, 100_000_000).unwrap();
                for doc in docs {
                    index_writer.add_document(doc).unwrap();
                }
                if commit {
                    index_writer.commit().unwrap();
                }
            },
            BatchSize::SmallInput,
        )
    }
 }
 fn benchmark_dynamic_json(
    b: &mut Bencher,
    input: &str,
    schema: tantivy::schema::Schema,
    commit: bool,
    parse_json: bool,
 ) {
    let json_field = schema.get_field("json").unwrap();
    _benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
        let json_val: serde_json::Map<String, serde_json::Value> =
            serde_json::from_str(doc_json).unwrap();
        tantivy::doc!(json_field=>json_val)
    })
 }
 pub fn hdfs_index_benchmark(c: &mut Criterion) {
@@ -19,7 +104,14 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
        schema_builder.add_text_field("severity", STRING);
        schema_builder.build()
    };
-    let schema_with_store = {
+    let schema_only_fast = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", FAST);
        schema_builder.add_text_field("body", FAST);
        schema_builder.add_text_field("severity", FAST);
        schema_builder.build()
    };
    let _schema_with_store = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_u64_field("timestamp", INDEXED | STORED);
        schema_builder.add_text_field("body", TEXT | STORED);
@@ -28,77 +120,39 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
    };
    let dynamic_schema = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
-        schema_builder.add_json_field("json", TEXT);
+        schema_builder.add_json_field("json", TEXT | FAST);
        schema_builder.build()
    };
    let mut group = c.benchmark_group("index-hdfs");
    group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
    group.sample_size(20);
-    group.bench_function("index-hdfs-no-commit", |b| {
+
-        let lines = get_lines(HDFS_LOGS);
+    let benches = [
-        b.iter(|| {
+        ("only-indexed-".to_string(), schema, false),
-            let index = Index::create_in_ram(schema.clone());
+        //("stored-".to_string(), _schema_with_store, false),
-            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
+        ("only-fast-".to_string(), schema_only_fast, false),
-            for doc_json in &lines {
+        ("dynamic-".to_string(), dynamic_schema, true),
-                let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
+    ];
-                index_writer.add_document(doc).unwrap();
+
    for (prefix, schema, is_dynamic) in benches {
        for commit in [false, true] {
            let suffix = if commit { "with-commit" } else { "no-commit" };
            for parse_json in [false] {
                // for parse_json in [false, true] {
                let suffix = if parse_json {
                    format!("{}-with-json-parsing", suffix)
                } else {
                    format!("{}", suffix)
                };
                let bench_name = format!("{}{}", prefix, suffix);
                group.bench_function(bench_name, |b| {
                    benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
                });
            }
-        })
+        }
-    });
+    }
    group.bench_function("index-hdfs-with-commit", |b| {
        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(schema.clone());
            let mut index_writer: IndexWriter =
                index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
                index_writer.add_document(doc).unwrap();
            }
        })
    });
    group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(schema_with_store.clone());
            let mut index_writer: IndexWriter =
                index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
    group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
        let lines = get_lines(HDFS_LOGS);
        b.iter(|| {
            let index = Index::create_in_ram(dynamic_schema.clone());
            let json_field = dynamic_schema.get_field("json").unwrap();
            let mut index_writer: IndexWriter =
                index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let json_val: serde_json::Map<String, serde_json::Value> =
                    serde_json::from_str(doc_json).unwrap();
                let doc = tantivy::doc!(json_field=>json_val);
                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
 }
 pub fn gh_index_benchmark(c: &mut Criterion) {
@@ -107,39 +161,24 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
        schema_builder.add_json_field("json", TEXT | FAST);
        schema_builder.build()
    };
    let dynamic_schema_fast = {
        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
        schema_builder.add_json_field("json", FAST);
        schema_builder.build()
    };
    let mut group = c.benchmark_group("index-gh");
    group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
    group.bench_function("index-gh-no-commit", |b| {
-        let lines = get_lines(GH_LOGS);
+        benchmark_dynamic_json(b, GH_LOGS, dynamic_schema.clone(), false, false)
        b.iter(|| {
            let json_field = dynamic_schema.get_field("json").unwrap();
            let index = Index::create_in_ram(dynamic_schema.clone());
            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let json_val: serde_json::Map<String, serde_json::Value> =
                    serde_json::from_str(doc_json).unwrap();
                let doc = tantivy::doc!(json_field=>json_val);
                index_writer.add_document(doc).unwrap();
            }
        })
    });
-    group.bench_function("index-gh-with-commit", |b| {
+    group.bench_function("index-gh-fast", |b| {
-        let lines = get_lines(GH_LOGS);
+        benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), false, false)
-        b.iter(|| {
+    });
-            let json_field = dynamic_schema.get_field("json").unwrap();
+
-            let index = Index::create_in_ram(dynamic_schema.clone());
+    group.bench_function("index-gh-fast-with-commit", |b| {
-            let mut index_writer: IndexWriter =
+        benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), true, false)
                index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let json_val: serde_json::Map<String, serde_json::Value> =
                    serde_json::from_str(doc_json).unwrap();
                let doc = tantivy::doc!(json_field=>json_val);
                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
 }
@@ -154,34 +193,10 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
    group.throughput(Throughput::Bytes(WIKI.len() as u64));
    group.bench_function("index-wiki-no-commit", |b| {
-        let lines = get_lines(WIKI);
+        benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), false, false)
        b.iter(|| {
            let json_field = dynamic_schema.get_field("json").unwrap();
            let index = Index::create_in_ram(dynamic_schema.clone());
            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let json_val: serde_json::Map<String, serde_json::Value> =
                    serde_json::from_str(doc_json).unwrap();
                let doc = tantivy::doc!(json_field=>json_val);
                index_writer.add_document(doc).unwrap();
            }
        })
    });
    group.bench_function("index-wiki-with-commit", |b| {
-        let lines = get_lines(WIKI);
+        benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), true, false)
        b.iter(|| {
            let json_field = dynamic_schema.get_field("json").unwrap();
            let index = Index::create_in_ram(dynamic_schema.clone());
            let mut index_writer: IndexWriter =
                index.writer_with_num_threads(1, 100_000_000).unwrap();
            for doc_json in &lines {
                let json_val: serde_json::Map<String, serde_json::Value> =
                    serde_json::from_str(doc_json).unwrap();
                let doc = tantivy::doc!(json_field=>json_val);
                index_writer.add_document(doc).unwrap();
            }
            index_writer.commit().unwrap();
        })
    });
 }
@@ -192,12 +207,12 @@ criterion_group! {
 }
 criterion_group! {
    name = gh_benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    config = Criterion::default();
    targets = gh_index_benchmark
 }
 criterion_group! {
    name = wiki_benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    config = Criterion::default();
    targets = wiki_index_benchmark
 }
 criterion_main!(benches, gh_benches, wiki_benches);
--- a/bitpacker/Cargo.toml
+++ b/bitpacker/Cargo.toml
@@ -15,7 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
-bitpacking = {version="0.8", default-features=false, features = ["bitpacker1x"]}
+bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] }
 [dev-dependencies]
 rand = "0.8"
--- a/columnar/Cargo.toml
+++ b/columnar/Cargo.toml
@@ -9,8 +9,7 @@ description = "column oriented storage for tantivy"
 categories = ["database-implementations", "data-structures", "compression"]
 [dependencies]
-itertools = "0.11.0"
+itertools = "0.12.0"
 fnv = "1.0.7"
 fastdivide = "0.4.0"
 stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}
--- a/columnar/columnar-cli/Cargo.toml
+++ b/columnar/columnar-cli/Cargo.toml
@@ -8,7 +8,6 @@ license = "MIT"
 columnar = {path="../", package="tantivy-columnar"}
 serde_json = "1"
 serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"}
 serde = "1"
 [workspace]
 members = []
--- a/columnar/src/columnar/writer/column_writers.rs
+++ b/columnar/src/columnar/writer/column_writers.rs
@@ -269,7 +269,8 @@ impl StrOrBytesColumnWriter {
        dictionaries: &mut [DictionaryBuilder],
        arena: &mut MemoryArena,
    ) {
-        let unordered_id = dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes);
+        let unordered_id =
            dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes, arena);
        self.column_writer.record(doc, unordered_id, arena);
    }
--- a/columnar/src/columnar/writer/mod.rs
+++ b/columnar/src/columnar/writer/mod.rs
@@ -437,6 +437,7 @@ impl ColumnarWriter {
                            &mut symbol_byte_buffer,
                        ),
                        buffers,
                        &self.arena,
                        &mut column_serializer,
                    )?;
                    column_serializer.finalize()?;
@@ -490,6 +491,7 @@ impl ColumnarWriter {
 // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
 // Column: [Column Index, Column Values, column index num bytes U32::LE]
 #[allow(clippy::too_many_arguments)]
 fn serialize_bytes_or_str_column(
    cardinality: Cardinality,
    num_docs: RowId,
@@ -497,6 +499,7 @@ fn serialize_bytes_or_str_column(
    dictionary_builder: &DictionaryBuilder,
    operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>,
    buffers: &mut SpareBuffers,
    arena: &MemoryArena,
    wrt: impl io::Write,
 ) -> io::Result<()> {
    let SpareBuffers {
@@ -505,7 +508,8 @@ fn serialize_bytes_or_str_column(
        ..
    } = buffers;
    let mut counting_writer = CountingWriter::wrap(wrt);
-    let term_id_mapping: TermIdMapping = dictionary_builder.serialize(&mut counting_writer)?;
+    let term_id_mapping: TermIdMapping =
        dictionary_builder.serialize(arena, &mut counting_writer)?;
    let dictionary_num_bytes: u32 = counting_writer.written_bytes() as u32;
    let mut wrt = counting_writer.finish();
    let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| {
--- a/columnar/src/dictionary.rs
+++ b/columnar/src/dictionary.rs
@@ -1,7 +1,7 @@
 use std::io;
 use fnv::FnvHashMap;
 use sstable::SSTable;
 use stacker::{MemoryArena, SharedArenaHashMap};
 pub(crate) struct TermIdMapping {
    unordered_to_ord: Vec<OrderedId>,
@@ -31,29 +31,38 @@ pub struct OrderedId(pub u32);
 /// mapping.
 #[derive(Default)]
 pub(crate) struct DictionaryBuilder {
-    dict: FnvHashMap<Vec<u8>, UnorderedId>,
+    dict: SharedArenaHashMap,
    memory_consumption: usize,
 }
 impl DictionaryBuilder {
    /// Get or allocate an unordered id.
    /// (This ID is simply an auto-incremented id.)
-    pub fn get_or_allocate_id(&mut self, term: &[u8]) -> UnorderedId {
+    pub fn get_or_allocate_id(&mut self, term: &[u8], arena: &mut MemoryArena) -> UnorderedId {
-        if let Some(term_id) = self.dict.get(term) {
+        let next_id = self.dict.len() as u32;
-            return *term_id;
+        let unordered_id = self
-        }
+            .dict
-        let new_id = UnorderedId(self.dict.len() as u32);
+            .mutate_or_create(term, arena, |unordered_id: Option<u32>| {
-        self.dict.insert(term.to_vec(), new_id);
+                if let Some(unordered_id) = unordered_id {
-        self.memory_consumption += term.len();
+                    unordered_id
-        self.memory_consumption += 40; // Term Metadata + HashMap overhead
+                } else {
-        new_id
+                    next_id
                }
            });
        UnorderedId(unordered_id)
    }
    /// Serialize the dictionary into an fst, and returns the
    /// `UnorderedId -> TermOrdinal` map.
-    pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<TermIdMapping> {
+    pub fn serialize<'a, W: io::Write + 'a>(
-        let mut terms: Vec<(&[u8], UnorderedId)> =
+        &self,
-            self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
+        arena: &MemoryArena,
        wrt: &mut W,
    ) -> io::Result<TermIdMapping> {
        let mut terms: Vec<(&[u8], UnorderedId)> = self
            .dict
            .iter(arena)
            .map(|(k, v)| (k, arena.read(v)))
            .collect();
        terms.sort_unstable_by_key(|(key, _)| *key);
        // TODO Remove the allocation.
        let mut unordered_to_ord: Vec<OrderedId> = vec![OrderedId(0u32); terms.len()];
@@ -68,7 +77,7 @@ impl DictionaryBuilder {
    }
    pub(crate) fn mem_usage(&self) -> usize {
-        self.memory_consumption
+        self.dict.mem_usage()
    }
 }
@@ -78,12 +87,13 @@ mod tests {
    #[test]
    fn test_dictionary_builder() {
        let mut arena = MemoryArena::default();
        let mut dictionary_builder = DictionaryBuilder::default();
-        let hello_uid = dictionary_builder.get_or_allocate_id(b"hello");
+        let hello_uid = dictionary_builder.get_or_allocate_id(b"hello", &mut arena);
-        let happy_uid = dictionary_builder.get_or_allocate_id(b"happy");
+        let happy_uid = dictionary_builder.get_or_allocate_id(b"happy", &mut arena);
-        let tax_uid = dictionary_builder.get_or_allocate_id(b"tax");
+        let tax_uid = dictionary_builder.get_or_allocate_id(b"tax", &mut arena);
        let mut buffer = Vec::new();
-        let id_mapping = dictionary_builder.serialize(&mut buffer).unwrap();
+        let id_mapping = dictionary_builder.serialize(&arena, &mut buffer).unwrap();
        assert_eq!(id_mapping.to_ord(hello_uid), OrderedId(1));
        assert_eq!(id_mapping.to_ord(happy_uid), OrderedId(0));
        assert_eq!(id_mapping.to_ord(tax_uid), OrderedId(2));
--- a/columnar/src/lib.rs
+++ b/columnar/src/lib.rs
@@ -1,3 +1,22 @@
 //! # Tantivy-Columnar
 //!
 //! `tantivy-columnar` provides a columnar storage for tantivy.
 //! The crate allows for efficient read operations on specific columns rather than entire records.
 //!
 //! ## Overview
 //!
 //! - **columnar**: Reading, writing, and merging multiple columns:
 //!   - **[ColumnarWriter]**: Makes it possible to create a new columnar.
 //!   - **[ColumnarReader]**: The ColumnarReader makes it possible to access a set of columns
 //!     associated to field names.
 //!   - **[merge_columnar]**: Contains the functionalities to merge multiple ColumnarReader or
 //!     segments into a single one.
 //!
 //! - **column**: A single column, which contains
 //!     - [column_index]: Resolves the rows for a document id. Manages the cardinality of the
 //!       column.
 //!     - [column_values]: Stores the values of a column in a dense format.
 #![cfg_attr(all(feature = "unstable", test), feature(test))]
 #[cfg(test)]
--- a/columnar/src/tests.rs
+++ b/columnar/src/tests.rs
@@ -26,7 +26,7 @@ fn test_dataframe_writer_str() {
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
    assert_eq!(cols.len(), 1);
-    assert_eq!(cols[0].num_bytes(), 99);
+    assert_eq!(cols[0].num_bytes(), 73);
 }
 #[test]
@@ -40,7 +40,7 @@ fn test_dataframe_writer_bytes() {
    assert_eq!(columnar.num_columns(), 1);
    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
    assert_eq!(cols.len(), 1);
-    assert_eq!(cols[0].num_bytes(), 99);
+    assert_eq!(cols[0].num_bytes(), 73);
 }
 #[test]
--- a/ownedbytes/src/lib.rs
+++ b/ownedbytes/src/lib.rs
@@ -1,5 +1,5 @@
 use std::convert::TryInto;
-use std::ops::Deref;
+use std::ops::{Deref, Range};
 use std::sync::Arc;
 use std::{fmt, io};
@@ -37,7 +37,7 @@ impl OwnedBytes {
    /// creates a fileslice that is just a view over a slice of the data.
    #[must_use]
    #[inline]
-    pub fn slice(&self, range: impl std::slice::SliceIndex<[u8], Output = [u8]>) -> Self {
+    pub fn slice(&self, range: Range<usize>) -> Self {
        OwnedBytes {
            data: &self.data[range],
            box_stable_deref: self.box_stable_deref.clone(),
--- a/src/aggregation/agg_tests.rs
+++ b/src/aggregation/agg_tests.rs
@@ -624,6 +624,65 @@ fn test_aggregation_on_json_object() {
    );
 }
 #[test]
 fn test_aggregation_on_nested_json_object() {
    let mut schema_builder = Schema::builder();
    let json = schema_builder.add_json_field("json.blub", FAST);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
    index_writer
        .add_document(doc!(json => json!({"color.dot": "red", "color": {"nested":"red"} })))
        .unwrap();
    index_writer
        .add_document(doc!(json => json!({"color.dot": "blue", "color": {"nested":"blue"} })))
        .unwrap();
    index_writer.commit().unwrap();
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    let agg: Aggregations = serde_json::from_value(json!({
        "jsonagg1": {
            "terms": {
                "field": "json\\.blub.color\\.dot",
            }
        },
        "jsonagg2": {
            "terms": {
                "field": "json\\.blub.color.nested",
            }
        }
    }))
    .unwrap();
    let aggregation_collector = get_collector(agg);
    let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
    let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
    assert_eq!(
        &aggregation_res_json,
        &serde_json::json!({
            "jsonagg1": {
                "buckets": [
                    {"doc_count": 1, "key": "blue"},
                    {"doc_count": 1, "key": "red"}
                ],
                "doc_count_error_upper_bound": 0,
                "sum_other_doc_count": 0
            },
            "jsonagg2": {
                "buckets": [
                    {"doc_count": 1, "key": "blue"},
                    {"doc_count": 1, "key": "red"}
                ],
                "doc_count_error_upper_bound": 0,
                "sum_other_doc_count": 0
            }
        })
    );
 }
 #[test]
 fn test_aggregation_on_json_object_empty_columns() {
    let mut schema_builder = Schema::builder();
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -23,6 +23,7 @@ use crate::reader::{IndexReader, IndexReaderBuilder};
 use crate::schema::document::Document;
 use crate::schema::{Field, FieldType, Schema};
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::{merge_field_meta_data, FieldMetadata, SegmentReader};
 fn load_metas(
    directory: &dyn Directory,
@@ -489,6 +490,28 @@ impl Index {
        self.inventory.all()
    }
    /// Returns the list of fields that have been indexed in the Index.
    /// The field list includes the field defined in the schema as well as the fields
    /// that have been indexed as a part of a JSON field.
    /// The returned field name is the full field name, including the name of the JSON field.
    ///
    /// The returned field names can be used in queries.
    ///
    /// Notice: If your data contains JSON fields this is **very expensive**, as it requires
    /// browsing through the inverted index term dictionary and the columnar field dictionary.
    ///
    /// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json
    /// field that is not indexed nor a fast field but is stored, it is possible for the field
    /// to not be listed.
    pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
        let segments = self.searchable_segments()?;
        let fields_metadata: Vec<Vec<FieldMetadata>> = segments
            .into_iter()
            .map(|segment| SegmentReader::open(&segment)?.fields_metadata())
            .collect::<Result<_, _>>()?;
        Ok(merge_field_meta_data(fields_metadata, &self.schema()))
    }
    /// Creates a new segment_meta (Advanced user only).
    ///
    /// As long as the `SegmentMeta` lives, the files associated with the
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -75,7 +75,7 @@ impl InvertedIndexReader {
    ///
    /// Notice: This requires a full scan and is therefore **very expensive**.
    /// TODO: Move to sstable to use the index.
-    pub fn list_fields(&self) -> io::Result<Vec<(String, Type)>> {
+    pub fn list_encoded_fields(&self) -> io::Result<Vec<(String, Type)>> {
        let mut stream = self.termdict.stream()?;
        let mut fields = Vec::new();
        let mut fields_set = FnvHashSet::default();
--- a/src/core/json_utils.rs
+++ b/src/core/json_utils.rs
@@ -62,6 +62,14 @@ impl IndexingPositionsPerPath {
    }
 }
 /// Convert JSON_PATH_SEGMENT_SEP to a dot.
 pub fn json_path_sep_to_dot(path: &mut str) {
    // This is safe since we are replacing an ASCII character by another ASCII character.
    unsafe {
        replace_in_place(JSON_PATH_SEGMENT_SEP, b'.', path.as_bytes_mut());
    }
 }
 #[allow(clippy::too_many_arguments)]
 pub(crate) fn index_json_values<'a, V: Value<'a>>(
    doc: DocId,
@@ -320,7 +328,7 @@ pub struct JsonTermWriter<'a> {
 /// In other words,
 /// - `k8s.node` ends up as `["k8s", "node"]`.
 /// - `k8s\.node` ends up as `["k8s.node"]`.
-fn split_json_path(json_path: &str) -> Vec<String> {
+pub fn split_json_path(json_path: &str) -> Vec<String> {
    let mut escaped_state: bool = false;
    let mut json_path_segments = Vec::new();
    let mut buffer = String::new();
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -25,7 +25,7 @@ pub use self::searcher::{Searcher, SearcherGeneration};
 pub use self::segment::Segment;
 pub use self::segment_component::SegmentComponent;
 pub use self::segment_id::SegmentId;
-pub use self::segment_reader::SegmentReader;
+pub use self::segment_reader::{merge_field_meta_data, FieldMetadata, SegmentReader};
 pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
 /// The meta file contains all the information about the list of segments and the schema
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -1,4 +1,6 @@
 use std::collections::BTreeMap;
 #[cfg(feature = "quickwit")]
 use std::future::Future;
 use std::sync::Arc;
 use std::{fmt, io};
@@ -112,6 +114,108 @@ impl Searcher {
        store_reader.get_async(doc_address.doc_id).await
    }
    /// Fetches a batch of documents asynchronously, yielding one future per store block.
    ///
    /// The requested addresses are bucketed by segment and by the store block that holds them.
    /// Compared to one [`doc_async`](Self::doc_async) call per document, overlapping requests
    /// are coalesced and concurrent requests do not thrash each other's caches. The price is
    /// some intermediate bookkeeping and an independent block cache per group, so this method
    /// is slower when the documents come from very few blocks which would have fit into the
    /// global block cache anyway.
    ///
    /// The returned futures are meant to be driven by the caller, either concurrently
    /// (e.g. using `FuturesUnordered`) or in parallel (e.g. using `JoinSet`), whichever fits
    /// the use case best, i.e. whether it is predominantly I/O-bound or rather CPU-bound.
    ///
    /// Blocks loaded on behalf of a group stay private to that group: they are not pulled into
    /// the global block cache and are therefore not available to subsequent calls.
    ///
    /// There is no synchronous variant of this method, as the same degree of efficiency can be
    /// had by accessing documents in address order.
    ///
    /// # Errors
    ///
    /// Returns an error right away if the block of one of the addresses cannot be determined.
    /// Errors hit while reading a document are reported by the future of its group.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use futures::executor::block_on;
    /// # use futures::stream::{FuturesUnordered, StreamExt};
    /// #
    /// # use tantivy::schema::Schema;
    /// # use tantivy::{DocAddress, Index, TantivyDocument, TantivyError};
    /// #
    /// # let index = Index::create_in_ram(Schema::builder().build());
    /// # let searcher = index.reader()?.searcher();
    /// #
    /// # let doc_addresses = (0..10).map(|_| DocAddress::new(0, 0));
    /// #
    /// let mut groups: FuturesUnordered<_> = searcher
    ///     .docs_async::<TantivyDocument>(doc_addresses)?
    ///     .collect();
    ///
    /// let mut docs = Vec::new();
    ///
    /// block_on(async {
    ///     while let Some(group) = groups.next().await {
    ///         docs.extend(group?);
    ///     }
    ///
    ///     Ok::<_, TantivyError>(())
    /// })?;
    /// #
    /// # Ok::<_, TantivyError>(())
    /// ```
    #[cfg(feature = "quickwit")]
    pub fn docs_async<D: DocumentDeserialize>(
        &self,
        doc_addresses: impl IntoIterator<Item = DocAddress>,
    ) -> crate::Result<
        impl Iterator<Item = impl Future<Output = crate::Result<Vec<(DocAddress, D)>>>> + '_,
    > {
        use rustc_hash::FxHashMap;
        use crate::store::CacheKey;
        use crate::{DocId, SegmentOrdinal};
        // Bucket the requested doc ids by (segment, block).
        let mut doc_ids_per_block: FxHashMap<(SegmentOrdinal, CacheKey), Vec<DocId>> =
            FxHashMap::default();
        for DocAddress {
            segment_ord,
            doc_id,
        } in doc_addresses
        {
            let cache_key = self.inner.store_readers[segment_ord as usize].cache_key(doc_id)?;
            doc_ids_per_block
                .entry((segment_ord, cache_key))
                .or_default()
                .push(doc_id);
        }
        let block_futures =
            doc_ids_per_block
                .into_iter()
                .map(|((segment_ord, cache_key), doc_ids)| {
                    // A group only ever reads from a single block, hence a private
                    // block cache with room for one block is all it needs.
                    let block_reader =
                        self.inner.store_readers[segment_ord as usize].fork_cache(1, &[cache_key]);
                    async move {
                        let mut fetched = Vec::with_capacity(doc_ids.len());
                        for doc_id in doc_ids {
                            let doc = block_reader.get_async(doc_id).await?;
                            fetched.push((DocAddress::new(segment_ord, doc_id), doc));
                        }
                        Ok(fetched)
                    }
                });
        Ok(block_futures)
    }
    /// Access the schema associated with the index of this searcher.
    pub fn schema(&self) -> &Schema {
        &self.inner.schema
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -1,12 +1,17 @@
 use std::collections::HashMap;
 use std::ops::BitOrAssign;
 use std::sync::{Arc, RwLock};
 use std::{fmt, io};
 use fnv::FnvHashMap;
 use itertools::Itertools;
 use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
 use crate::directory::{CompositeFile, FileSlice};
 use crate::error::DataCorruption;
 use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
 use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
 use crate::json_utils::json_path_sep_to_dot;
 use crate::schema::{Field, IndexRecordOption, Schema, Type};
 use crate::space_usage::SegmentSpaceUsage;
 use crate::store::StoreReader;
@@ -280,6 +285,103 @@ impl SegmentReader {
        Ok(inv_idx_reader)
    }
    /// Returns the list of fields that have been indexed in the segment.
    /// The field list includes the fields defined in the schema as well as the fields
    /// that have been indexed as a part of a JSON field.
    /// The returned field name is the full field name, including the name of the JSON field.
    ///
    /// The returned field names can be used in queries.
    ///
    /// Notice: If your data contains JSON fields this is **very expensive**, as it requires
    /// browsing through the inverted index term dictionary and the columnar field dictionary.
    ///
    /// Disclaimer: Some fields may not be listed here. For instance, if the schema contains a json
    /// field that is neither indexed nor a fast field but is stored, it is possible for the field
    /// to not be listed.
    ///
    /// # Errors
    ///
    /// Returns an error if opening the inverted index of a JSON field, listing its encoded
    /// fields, or iterating over the columns of the fast fields fails.
    pub fn fields_metadata(&self) -> crate::Result<Vec<FieldMetadata>> {
        let mut indexed_fields: Vec<FieldMetadata> = Vec::new();
        // Maps an unescaped full path to the escaped path reported for the indexed field, so
        // that the fast field entries built further down can reuse the exact same name.
        let mut map_to_canonical = FnvHashMap::default();
        for (field, field_entry) in self.schema().fields() {
            let field_name = field_entry.name().to_string();
            let is_indexed = field_entry.is_indexed();
            if is_indexed {
                let is_json = field_entry.field_type().value_type() == Type::Json;
                if is_json {
                    // For JSON fields, the paths come from the inverted index itself.
                    let inv_index = self.inverted_index(field)?;
                    let encoded_fields_in_index = inv_index.list_encoded_fields()?;
                    let mut build_path = |field_name: &str, mut json_path: String| {
                        // In this case we need to map the potential fast field to the field name
                        // accepted by the query parser.
                        let create_canonical =
                            !field_entry.is_expand_dots_enabled() && json_path.contains('.');
                        if create_canonical {
                            // Without expand dots enabled dots need to be escaped.
                            // NOTE(review): `json_path_sep_to_dot` is not applied in this branch,
                            // so a path holding both a literal '.' and a segment separator keeps
                            // the separator, both in the returned name and in the
                            // `map_to_canonical` key. Confirm whether this is intended.
                            let escaped_json_path = json_path.replace('.', "\\.");
                            let full_path = format!("{}.{}", field_name, escaped_json_path);
                            let full_path_unescaped = format!("{}.{}", field_name, &json_path);
                            map_to_canonical.insert(full_path_unescaped, full_path.to_string());
                            full_path
                        } else {
                            // With expand dots enabled, we can use '.' instead of '\u{1}'.
                            json_path_sep_to_dot(&mut json_path);
                            format!("{}.{}", field_name, json_path)
                        }
                    };
                    indexed_fields.extend(
                        encoded_fields_in_index
                            .into_iter()
                            .map(|(name, typ)| (build_path(&field_name, name), typ))
                            .map(|(field_name, typ)| FieldMetadata {
                                indexed: true,
                                stored: false,
                                field_name,
                                fast: false,
                                typ,
                            }),
                    );
                } else {
                    // Any other indexed field is reported under its schema name and type.
                    indexed_fields.push(FieldMetadata {
                        indexed: true,
                        stored: false,
                        field_name: field_name.to_string(),
                        fast: false,
                        typ: field_entry.field_type().value_type(),
                    });
                }
            }
        }
        // Every column of the columnar storage is reported as a fast field.
        let mut fast_fields: Vec<FieldMetadata> = self
            .fast_fields()
            .columnar()
            .iter_columns()?
            .map(|(mut field_name, handle)| {
                json_path_sep_to_dot(&mut field_name);
                // map to canonical path, to avoid similar but different entries.
                // Eventually we should just accept '.' separated for all cases.
                let field_name = map_to_canonical
                    .get(&field_name)
                    .unwrap_or(&field_name)
                    .to_string();
                FieldMetadata {
                    indexed: false,
                    stored: false,
                    field_name,
                    fast: true,
                    typ: Type::from(handle.column_type()),
                }
            })
            .collect();
        // Since the type is encoded differently in the fast field and in the inverted index,
        // the order of the fields is not guaranteed to be the same. Therefore, we sort the fields.
        // If we are sure that the order is the same, we can remove this sort.
        indexed_fields.sort_unstable();
        fast_fields.sort_unstable();
        // The `stored` flag, hard-coded to `false` above, is filled in by the merge helper.
        let merged = merge_field_meta_data(vec![indexed_fields, fast_fields], &self.schema);
        Ok(merged)
    }
    /// Returns the segment id
    pub fn segment_id(&self) -> SegmentId {
        self.segment_id
@@ -330,6 +432,65 @@ impl SegmentReader {
    }
 }
 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
 /// Describes one field: its name, its type, and whether it is indexed, stored
 /// and/or held in the columnar (fast field) storage.
 ///
 /// The derived ordering compares the members in declaration order, i.e. by `field_name`
 /// first, then by `typ`, then by the flags.
 pub struct FieldMetadata {
    /// The field name
    // Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
    // field_name then typ.
    pub field_name: String,
    /// The field type
    // Notice: Don't reorder the declaration of 1.field_name 2.typ, as it is used for ordering by
    // field_name then typ.
    pub typ: Type,
    /// Is the field indexed for search
    pub indexed: bool,
    /// Is the field stored in the doc store
    pub stored: bool,
    /// Is the field stored in the columnar storage
    pub fast: bool,
 }
 impl BitOrAssign for FieldMetadata {
    /// Folds the flags of `rhs` into `self`: a flag ends up set if either side had it set.
    ///
    /// # Panics
    ///
    /// Panics if `rhs` has a different field name or a different type than `self`.
    fn bitor_assign(&mut self, rhs: Self) {
        // Merging only makes sense for two descriptions of the very same field.
        assert!(self.field_name == rhs.field_name);
        assert!(self.typ == rhs.typ);
        self.fast = self.fast || rhs.fast;
        self.stored = self.stored || rhs.stored;
        self.indexed = self.indexed || rhs.indexed;
    }
 }
 /// Tells whether the schema field that `field_name` resolves to is stored.
 ///
 /// Returns `false` when `field_name` does not resolve to any field of `schema`.
 // Maybe too slow for the high cardinality case
 fn is_field_stored(field_name: &str, schema: &Schema) -> bool {
    schema
        .find_field(field_name)
        .map_or(false, |(field, _json_path)| {
            schema.get_field_entry(field).is_stored()
        })
 }
 /// Helper to merge the field metadata from multiple segments.
 ///
 /// The lists are combined with a k-way merge. Consecutive entries of the merged sequence
 /// sharing the same `(field_name, typ)` pair are folded into a single entry whose `indexed`
 /// and `fast` flags are the union of the folded ones. The `stored` flag is not taken from
 /// the entries: it is looked up in `schema` for every resulting entry.
 ///
 /// Each inner list is expected to be sorted. Otherwise entries describing the same field may
 /// not end up next to each other, in which case they are not folded together.
 pub fn merge_field_meta_data(
    field_metadatas: Vec<Vec<FieldMetadata>>,
    schema: &Schema,
 ) -> Vec<FieldMetadata> {
    let mut merged_field_metadata: Vec<FieldMetadata> = Vec::new();
    for el in field_metadatas
        .into_iter()
        .kmerge_by(|left, right| left < right)
    {
        // Fold into the previous entry when it describes the same field. Comparing against
        // the last pushed entry avoids building an owned `(String, Type)` key per element.
        if let Some(last) = merged_field_metadata.last_mut() {
            if last.field_name == el.field_name && last.typ == el.typ {
                *last |= el;
                continue;
            }
        }
        merged_field_metadata.push(el);
    }
    // Done in a second pass so that the schema lookup always overrides whatever `stored`
    // value resulted from folding the entries.
    // Currently is_field_stored is maybe too slow for the high cardinality case
    for merged in &mut merged_field_metadata {
        merged.stored = is_field_stored(&merged.field_name, schema);
    }
    merged_field_metadata
 }
 fn intersect_alive_bitset(
    left_opt: Option<AliveBitSet>,
    right_opt: Option<AliveBitSet>,
@@ -353,9 +514,127 @@ impl fmt::Debug for SegmentReader {
 #[cfg(test)]
 mod test {
    use super::*;
    use crate::core::Index;
-    use crate::schema::{Schema, Term, STORED, TEXT};
+    use crate::schema::{Schema, SchemaBuilder, Term, STORED, TEXT};
-    use crate::{DocId, IndexWriter};
+    use crate::{DocId, FieldMetadata, IndexWriter};
    #[test]
    fn test_merge_field_meta_data_same() {
        // Two lists holding the very same entry must collapse into a single entry.
        let field_a = FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
            indexed: true,
            stored: false,
            fast: true,
        };
        let per_segment = vec![vec![field_a.clone()], vec![field_a.clone()]];
        let schema = SchemaBuilder::new().build();
        let merged = merge_field_meta_data(per_segment, &schema);
        assert_eq!(merged, vec![field_a]);
    }
    #[test]
    fn test_merge_field_meta_data_different() {
        // Builds a `Str` entry that is not stored; only the name and two flags vary.
        let str_field = |name: &str, indexed: bool, fast: bool| FieldMetadata {
            field_name: name.to_string(),
            typ: crate::schema::Type::Str,
            indexed,
            stored: false,
            fast,
        };
        let a_fast = str_field("a", false, true);
        let b_fast = str_field("b", false, true);
        let a_indexed = str_field("a", true, false);
        let schema = SchemaBuilder::new().build();
        let res = merge_field_meta_data(
            vec![vec![a_fast, b_fast.clone()], vec![a_indexed]],
            &schema,
        );
        // Both "a" entries are folded into one (indexed and fast); "b" comes out unchanged.
        assert_eq!(res, vec![str_field("a", true, true), b_fast]);
    }
    #[test]
    fn test_merge_field_meta_data_merge() {
        use pretty_assertions::assert_eq;
        // Builds a fast-only entry with the given name and type.
        let fast_only = |name: &str, typ: Type| FieldMetadata {
            field_name: name.to_string(),
            typ,
            indexed: false,
            stored: false,
            fast: true,
        };
        let schema = SchemaBuilder::new().build();
        let first_list = vec![fast_only("e", Type::Str)];
        let mut second_list = vec![fast_only("d", Type::Str), fast_only("e", Type::U64)];
        second_list.sort();
        let res = merge_field_meta_data(vec![first_list, second_list], &schema);
        // Entries sharing a name but not a type must stay separate.
        let expected = vec![
            fast_only("d", Type::Str),
            fast_only("e", Type::Str),
            fast_only("e", Type::U64),
        ];
        assert_eq!(res, expected);
    }
    #[test]
    fn test_merge_field_meta_data_bitxor() {
        // Despite its name, this test exercises `|=` (`BitOrAssign`).
        let str_a = |indexed: bool, fast: bool| FieldMetadata {
            field_name: "a".to_string(),
            typ: crate::schema::Type::Str,
            indexed,
            stored: false,
            fast,
        };
        let fast_only = str_a(false, true);
        let indexed_only = str_a(true, false);
        let expected = str_a(true, true);
        // The outcome must not depend on which side is the receiver.
        let mut fast_then_indexed = fast_only.clone();
        fast_then_indexed |= indexed_only.clone();
        let mut indexed_then_fast = indexed_only;
        indexed_then_fast |= fast_only;
        assert_eq!(fast_then_indexed, expected);
        assert_eq!(indexed_then_fast, expected);
    }
    #[test]
    fn test_num_alive() -> crate::Result<()> {
--- a/src/core/tests.rs
+++ b/src/core/tests.rs
@@ -1,12 +1,13 @@
 use crate::collector::Count;
 use crate::directory::{RamDirectory, WatchCallback};
-use crate::indexer::NoMergePolicy;
+use crate::indexer::{LogMergePolicy, NoMergePolicy};
 use crate::json_utils::JsonTermWriter;
 use crate::query::TermQuery;
-use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
+use crate::schema::{Field, IndexRecordOption, Schema, Type, INDEXED, STRING, TEXT};
 use crate::tokenizer::TokenizerManager;
 use crate::{
-    Directory, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
+    Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, Postings,
-    SegmentId, TantivyDocument, Term,
+    ReloadPolicy, SegmentId, TantivyDocument, Term,
 };
 #[test]
@@ -344,3 +345,189 @@ fn test_merging_segment_update_docfreq() {
    let term_info = inv_index.get_term_info(&term).unwrap().unwrap();
    assert_eq!(term_info.doc_freq, 12);
 }
 // motivated by https://github.com/quickwit-oss/quickwit/issues/4130
 //
 // Commits the same JSON document (holding a numeric value) twice and checks that,
 // once the merging threads are done, a single segment is left.
 #[test]
 fn test_positions_merge_bug_non_text_json_vint() {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_json_field("dynamic", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let mut writer: IndexWriter = index.writer_for_tests().unwrap();
    // Presumably makes the policy merge as soon as two segments exist; confirm against
    // `LogMergePolicy`.
    let mut merge_policy = LogMergePolicy::default();
    merge_policy.set_min_num_segments(2);
    writer.set_merge_policy(Box::new(merge_policy));
    // Here a string would work: the value is a number on purpose.
    let doc_json = r#"{"tenant_id":75}"#;
    let vals = serde_json::from_str(doc_json).unwrap();
    let mut doc = TantivyDocument::default();
    doc.add_object(field, vals);
    // One document per commit, committed twice.
    writer.add_document(doc.clone()).unwrap();
    writer.commit().unwrap();
    writer.add_document(doc.clone()).unwrap();
    writer.commit().unwrap();
    writer.wait_merging_threads().unwrap();
    let reader = index.reader().unwrap();
    // The merge must have gone through, leaving exactly one segment.
    assert_eq!(reader.searcher().segment_readers().len(), 1);
 }
 // Same as above but with bitpacked blocks
 //
 // The first commit holds 128 copies of the document (presumably enough to fill one bitpacked
 // block; confirm against the postings block size), the second commit holds a single copy.
 #[test]
 fn test_positions_merge_bug_non_text_json_bitpacked_block() {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_json_field("dynamic", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let mut writer: IndexWriter = index.writer_for_tests().unwrap();
    // Presumably makes the policy merge as soon as two segments exist; confirm against
    // `LogMergePolicy`.
    let mut merge_policy = LogMergePolicy::default();
    merge_policy.set_min_num_segments(2);
    writer.set_merge_policy(Box::new(merge_policy));
    // Here a string would work: the value is a number on purpose.
    let doc_json = r#"{"tenant_id":75}"#;
    let vals = serde_json::from_str(doc_json).unwrap();
    let mut doc = TantivyDocument::default();
    doc.add_object(field, vals);
    for _ in 0..128 {
        writer.add_document(doc.clone()).unwrap();
    }
    writer.commit().unwrap();
    writer.add_document(doc.clone()).unwrap();
    writer.commit().unwrap();
    writer.wait_merging_threads().unwrap();
    let reader = index.reader().unwrap();
    // The merge must have gone through, leaving exactly one segment.
    assert_eq!(reader.searcher().segment_readers().len(), 1);
 }
 // Indexes a single JSON document holding a numeric value, then checks the doc id and the
 // term frequency reported by the postings of the matching `U64` JSON term.
 #[test]
 fn test_non_text_json_term_freq() {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_json_field("dynamic", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let mut writer: IndexWriter = index.writer_for_tests().unwrap();
    // Here a string would work: the value is a number on purpose.
    let doc_json = r#"{"tenant_id":75}"#;
    let vals = serde_json::from_str(doc_json).unwrap();
    let mut doc = TantivyDocument::default();
    doc.add_object(field, vals);
    writer.add_document(doc.clone()).unwrap();
    writer.commit().unwrap();
    let reader = index.reader().unwrap();
    assert_eq!(reader.searcher().segment_readers().len(), 1);
    let searcher = reader.searcher();
    let segment_reader = searcher.segment_reader(0u32);
    let inv_idx = segment_reader.inverted_index(field).unwrap();
    // Build the term for path `tenant_id`, typed as `U64`, with value 75.
    let mut term = Term::with_type_and_field(Type::Json, field);
    let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
    json_term_writer.push_path_segment("tenant_id");
    json_term_writer.close_path_and_set_type(Type::U64);
    json_term_writer.set_fast_value(75u64);
    // Frequencies and positions are requested even though the term is not a text term.
    let postings = inv_idx
        .read_postings(
            json_term_writer.term(),
            IndexRecordOption::WithFreqsAndPositions,
        )
        .unwrap()
        .unwrap();
    assert_eq!(postings.doc(), 0);
    assert_eq!(postings.term_freq(), 1u32);
 }
 // Same check as `test_non_text_json_term_freq`, but over 132 copies of the document; going by
 // the test name, this is meant to make the postings use bitpacked blocks.
 #[test]
 fn test_non_text_json_term_freq_bitpacked() {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_json_field("dynamic", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());
    let mut writer: IndexWriter = index.writer_for_tests().unwrap();
    // Here a string would work: the value is a number on purpose.
    let doc_json = r#"{"tenant_id":75}"#;
    let vals = serde_json::from_str(doc_json).unwrap();
    let mut doc = TantivyDocument::default();
    doc.add_object(field, vals);
    let num_docs = 132;
    for _ in 0..num_docs {
        writer.add_document(doc.clone()).unwrap();
    }
    writer.commit().unwrap();
    let reader = index.reader().unwrap();
    assert_eq!(reader.searcher().segment_readers().len(), 1);
    let searcher = reader.searcher();
    let segment_reader = searcher.segment_reader(0u32);
    let inv_idx = segment_reader.inverted_index(field).unwrap();
    // Build the term for path `tenant_id`, typed as `U64`, with value 75.
    let mut term = Term::with_type_and_field(Type::Json, field);
    let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
    json_term_writer.push_path_segment("tenant_id");
    json_term_writer.close_path_and_set_type(Type::U64);
    json_term_writer.set_fast_value(75u64);
    let mut postings = inv_idx
        .read_postings(
            json_term_writer.term(),
            IndexRecordOption::WithFreqsAndPositions,
        )
        .unwrap()
        .unwrap();
    assert_eq!(postings.doc(), 0);
    assert_eq!(postings.term_freq(), 1u32);
    // Every following document must be reached in order, each with a term frequency of 1.
    for i in 1..num_docs {
        assert_eq!(postings.advance(), i);
        assert_eq!(postings.term_freq(), 1u32);
    }
 }
 // Stores ten documents numbered 0 to 9, fetches all of them through `Searcher::docs_async`
 // and checks that every number comes back exactly once.
 #[cfg(feature = "quickwit")]
 #[test]
 fn test_get_many_docs() -> crate::Result<()> {
    use futures::executor::block_on;
    use futures::stream::{FuturesUnordered, StreamExt};
    use crate::schema::{OwnedValue, STORED};
    use crate::{DocAddress, TantivyError};
    let mut schema_builder = Schema::builder();
    let num_field = schema_builder.add_u64_field("num", STORED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer: IndexWriter = index.writer_for_tests()?;
    index_writer.set_merge_policy(Box::new(NoMergePolicy));
    for i in 0..10u64 {
        let doc = doc!(num_field=>i);
        index_writer.add_document(doc)?;
    }
    index_writer.commit()?;
    // Merge all searchable segments explicitly, so that the addresses built below
    // (segment 0, doc ids 0 to 9) cover all the documents.
    let segment_ids = index.searchable_segment_ids()?;
    index_writer.merge(&segment_ids).wait().unwrap();
    let searcher = index.reader()?.searcher();
    assert_eq!(searcher.num_docs(), 10);
    let doc_addresses = (0..10).map(|i| DocAddress::new(0, i));
    let mut groups: FuturesUnordered<_> = searcher
        .docs_async::<TantivyDocument>(doc_addresses)?
        .collect();
    let mut doc_nums = Vec::new();
    block_on(async {
        while let Some(group) = groups.next().await {
            for (_doc_address, doc) in group? {
                let num_value = doc.get_first(num_field).unwrap();
                if let OwnedValue::U64(num) = num_value {
                    doc_nums.push(*num);
                } else {
                    panic!("Expected u64 value");
                }
            }
        }
        Ok::<_, TantivyError>(())
    })?;
    // Groups complete in no particular order, hence the sort before comparing.
    doc_nums.sort();
    assert_eq!(doc_nums, (0..10).collect::<Vec<u64>>());
    Ok(())
 }
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -131,7 +131,7 @@ mod tests {
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 105);
+        assert_eq!(file.len(), 80);
        let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
        let column = fast_field_readers
            .u64("field")
@@ -181,7 +181,7 @@ mod tests {
            write.terminate().unwrap();
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 133);
+        assert_eq!(file.len(), 108);
        let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
        let col = fast_field_readers
            .u64("field")
@@ -214,7 +214,7 @@ mod tests {
            write.terminate().unwrap();
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 106);
+        assert_eq!(file.len(), 81);
        let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
        let fast_field_reader = fast_field_readers
            .u64("field")
@@ -246,7 +246,7 @@ mod tests {
            write.terminate().unwrap();
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 4501);
+        assert_eq!(file.len(), 4476);
        {
            let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
            let col = fast_field_readers
@@ -279,7 +279,7 @@ mod tests {
            write.terminate().unwrap();
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 277);
+        assert_eq!(file.len(), 252);
        {
            let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
@@ -773,7 +773,7 @@ mod tests {
            write.terminate().unwrap();
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 114);
+        assert_eq!(file.len(), 84);
        let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
        let bool_col = fast_field_readers.bool("field_bool").unwrap();
        assert_eq!(bool_col.first(0), Some(true));
@@ -805,7 +805,7 @@ mod tests {
            write.terminate().unwrap();
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 126);
+        assert_eq!(file.len(), 96);
        let readers = FastFieldReaders::open(file, schema).unwrap();
        let bool_col = readers.bool("field_bool").unwrap();
        for i in 0..25 {
@@ -830,7 +830,7 @@ mod tests {
            write.terminate().unwrap();
        }
        let file = directory.open_read(path).unwrap();
-        assert_eq!(file.len(), 116);
+        assert_eq!(file.len(), 86);
        let fastfield_readers = FastFieldReaders::open(file, schema).unwrap();
        let col = fastfield_readers.bool("field_bool").unwrap();
        assert_eq!(col.first(0), None);
@@ -1288,11 +1288,18 @@ mod tests {
        index_writer.commit().unwrap();
        let searcher = index.reader().unwrap().searcher();
        let fast_field_reader = searcher.segment_reader(0u32).fast_fields();
        // Supported for now, maybe dropped in the future.
        let column = fast_field_reader
            .column_opt::<i64>("jsonfield.attr.age")
            .unwrap()
            .unwrap();
        let vals: Vec<i64> = column.values_for_doc(0u32).collect();
        assert_eq!(&vals, &[33]);
        let column = fast_field_reader
            .column_opt::<i64>("jsonfield\\.attr.age")
            .unwrap()
            .unwrap();
        let vals: Vec<i64> = column.values_for_doc(0u32).collect();
        assert_eq!(&vals, &[33]);
    }
 }
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -552,7 +552,41 @@ impl IndexMerger {
                continue;
            }
-            field_serializer.new_term(term_bytes, total_doc_freq)?;
+            // This should never happen as we early exited for total_doc_freq == 0.
            assert!(!segment_postings_containing_the_term.is_empty());
            let has_term_freq = {
                let has_term_freq = !segment_postings_containing_the_term[0]
                    .1
                    .block_cursor
                    .freqs()
                    .is_empty();
                for (_, postings) in &segment_postings_containing_the_term[1..] {
                    // This may look like a strange way to test whether we have term freq or not.
                    // With JSON object, the schema is not sufficient to know whether a term
                    // has its term frequency encoded or not:
                    // strings may have term frequencies, while number terms never have one.
                    //
                    // Ideally, we should have burnt one bit of two in the `TermInfo`.
                    // However, we preferred not changing the codec too much and detect this
                    // instead by
                    // - looking at the size of the skip data for bitpacked blocks
                    // - observing the absence of remaining data after reading the docs for vint
                    // blocks.
                    //
                    // Overall the reliable way to know if we have actual frequencies loaded or not
                    // is to check whether the actual decoded array is empty or not.
                    if has_term_freq != !postings.block_cursor.freqs().is_empty() {
                        return Err(DataCorruption::comment_only(
                            "Term freqs are inconsistent across segments",
                        )
                        .into());
                    }
                }
                has_term_freq
            };
            field_serializer.new_term(term_bytes, total_doc_freq, has_term_freq)?;
            // We can now serialize this postings, by pushing each document to the
            // postings serializer.
@@ -567,8 +601,13 @@ impl IndexMerger {
                    if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
                        // we make sure to only write the term if
                        // there is at least one document.
-                        let term_freq = segment_postings.term_freq();
+                        let term_freq = if has_term_freq {
-                        segment_postings.positions(&mut positions_buffer);
+                            segment_postings.positions(&mut positions_buffer);
                            segment_postings.term_freq()
                        } else {
                            0u32
                        };
                        // if doc_id_mapping exists, the doc_ids are reordered, they are
                        // not just stacked. The field serializer expects monotonically increasing
                        // doc_ids, so we collect and sort them first, before writing.
--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -59,10 +59,13 @@ type AddBatchReceiver<D> = channel::Receiver<AddBatch<D>>;
 #[cfg(test)]
 mod tests_mmap {
-    use crate::collector::Count;
+    use crate::aggregation::agg_req::Aggregations;
-    use crate::query::QueryParser;
+    use crate::aggregation::agg_result::AggregationResults;
-    use crate::schema::{JsonObjectOptions, Schema, Type, TEXT};
+    use crate::aggregation::AggregationCollector;
-    use crate::{Index, IndexWriter, Term};
+    use crate::collector::{Count, TopDocs};
    use crate::query::{AllQuery, QueryParser};
    use crate::schema::{JsonObjectOptions, Schema, Type, FAST, INDEXED, STORED, TEXT};
    use crate::{FieldMetadata, Index, IndexWriter, Term};
    #[test]
    fn test_advance_delete_bug() -> crate::Result<()> {
@@ -173,8 +176,7 @@ mod tests_mmap {
    #[test]
    fn test_json_field_list_fields() {
        let mut schema_builder = Schema::builder();
-        let json_options: JsonObjectOptions =
+        let json_options: JsonObjectOptions = JsonObjectOptions::from(TEXT);
            JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
        let json_field = schema_builder.add_json_field("json", json_options);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
@@ -193,9 +195,9 @@ mod tests_mmap {
        let reader = &searcher.segment_readers()[0];
        let inverted_index = reader.inverted_index(json_field).unwrap();
        assert_eq!(
-            inverted_index.list_fields().unwrap(),
+            inverted_index.list_encoded_fields().unwrap(),
            [
-                ("k8s\u{1}container\u{1}name".to_string(), Type::Str),
+                ("k8s.container.name".to_string(), Type::Str),
                ("sub\u{1}a".to_string(), Type::I64),
                ("sub\u{1}b".to_string(), Type::I64),
                ("suber\u{1}a".to_string(), Type::I64),
@@ -205,4 +207,240 @@ mod tests_mmap {
            ]
        );
    }
    // The four tests below run `test_json_fields_metadata` over every combination of its two
    // flags: `expanded_dots` (first argument) and `one_segment` (second argument).
    #[test]
    fn test_json_fields_metadata_expanded_dots_one_segment() {
        test_json_fields_metadata(true, true);
    }
    #[test]
    fn test_json_fields_metadata_expanded_dots_multi_segment() {
        test_json_fields_metadata(true, false);
    }
    #[test]
    fn test_json_fields_metadata_no_expanded_dots_one_segment() {
        test_json_fields_metadata(false, true);
    }
    #[test]
    fn test_json_fields_metadata_no_expanded_dots_multi_segment() {
        test_json_fields_metadata(false, false);
    }
    // Shared body of the `test_json_fields_metadata_*` tests.
    //
    // Builds an in-RAM index with several JSON fields whose names overlap ("json.confusing",
    // "json.shadow", "json", "empty_json") plus two u64 fields, indexes three documents, and
    // checks that `index.fields_metadata()` returns the expected list. Every returned field name
    // is then fed back into the query parser and into a terms aggregation.
    //
    // * `expanded_dots` - when true, `set_expand_dots_enabled()` is applied to the JSON options.
    // * `one_segment` - when false, an extra commit is issued after the first document.
    fn test_json_fields_metadata(expanded_dots: bool, one_segment: bool) {
        use pretty_assertions::assert_eq;
        let mut schema_builder = Schema::builder();
        // All JSON fields share the same options: text-indexed, fast and stored.
        let json_options: JsonObjectOptions =
            JsonObjectOptions::from(TEXT).set_fast(None).set_stored();
        let json_options = if expanded_dots {
            json_options.set_expand_dots_enabled()
        } else {
            json_options
        };
        // Only "json.shadow" and "json" receive documents below; "json.confusing" and
        // "empty_json" are declared but never written to.
        schema_builder.add_json_field("json.confusing", json_options.clone());
        let json_field = schema_builder.add_json_field("json.shadow", json_options.clone());
        let json_field2 = schema_builder.add_json_field("json", json_options.clone());
        schema_builder.add_json_field("empty_json", json_options);
        let number_field = schema_builder.add_u64_field("numbers", FAST);
        // "empty" is declared but no document ever sets it.
        schema_builder.add_u64_field("empty", FAST | INDEXED | STORED);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
        let json =
            serde_json::json!({"k8s.container.name": "a", "val": "a", "sub": {"a": 1, "b": 1}});
        index_writer.add_document(doc!(json_field=>json)).unwrap();
        let json =
            serde_json::json!({"k8s.container.name": "a", "val": "a", "suber": {"a": 1, "b": 1}});
        // Commit after the first document when `one_segment` is false -- presumably so that the
        // documents end up in more than one segment (see the `*_multi_segment` test names).
        if !one_segment {
            index_writer.commit().unwrap();
        }
        index_writer.add_document(doc!(json_field=>json)).unwrap();
        // Here "suber.a" is a string while it was a number in the previous document, which is
        // why the expected metadata lists "json.shadow.suber.a" once as I64 and once as Str.
        // NOTE(review): the key "k8s.container.name" appears twice in this literal -- confirm
        // whether the duplication is intentional.
        let json = serde_json::json!({"k8s.container.name": "a", "k8s.container.name": "a", "val": "a", "suber": {"a": "a", "b": 1}});
        index_writer
            .add_document(doc!(number_field => 50u64, json_field=>json, json_field2=>json!({"shadow": {"val": "a"}})))
            .unwrap();
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        assert_eq!(searcher.num_docs(), 3);
        let fields_metadata = index.fields_metadata().unwrap();
        assert_eq!(
            fields_metadata,
            [
                FieldMetadata {
                    field_name: "empty".to_string(),
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::U64
                },
                FieldMetadata {
                    // The dots of the JSON key are reported escaped unless dot expansion is on.
                    field_name: if expanded_dots {
                        "json.shadow.k8s.container.name".to_string()
                    } else {
                        "json.shadow.k8s\\.container\\.name".to_string()
                    },
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::Str
                },
                FieldMetadata {
                    field_name: "json.shadow.sub.a".to_string(),
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::I64
                },
                FieldMetadata {
                    field_name: "json.shadow.sub.b".to_string(),
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::I64
                },
                FieldMetadata {
                    field_name: "json.shadow.suber.a".to_string(),
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::I64
                },
                FieldMetadata {
                    field_name: "json.shadow.suber.a".to_string(),
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::Str
                },
                FieldMetadata {
                    field_name: "json.shadow.suber.b".to_string(),
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::I64
                },
                FieldMetadata {
                    field_name: "json.shadow.val".to_string(),
                    indexed: true,
                    stored: true,
                    fast: true,
                    typ: Type::Str
                },
                FieldMetadata {
                    field_name: "numbers".to_string(),
                    indexed: false,
                    stored: false,
                    fast: true,
                    typ: Type::U64
                }
            ]
        );
        let query_parser = QueryParser::for_index(&index, vec![]);
        // Test if returned field name can be queried
        for indexed_field in fields_metadata.iter().filter(|meta| meta.indexed) {
            // Every string value indexed above is "a" and every numeric value is 1.
            let val = if indexed_field.typ == Type::Str {
                "a"
            } else {
                "1"
            };
            let query_str = &format!("{}:{}", indexed_field.field_name, val);
            let query = query_parser.parse_query(query_str).unwrap();
            let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
            // Fields whose name contains "empty" (and any Json-typed entry) must match nothing;
            // every other field must match at least one document.
            if indexed_field.field_name.contains("empty") || indexed_field.typ == Type::Json {
                assert_eq!(count_docs.len(), 0);
            } else {
                assert!(!count_docs.is_empty(), "{}", indexed_field.field_name);
            }
        }
        // Test if returned field name can be used for aggregation
        for fast_field in fields_metadata.iter().filter(|meta| meta.fast) {
            let agg_req_str = json!(
            {
              "termagg": {
                "terms": {
                  "field": fast_field.field_name,
                }
              }
            });
            let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
            let collector = AggregationCollector::from_aggs(agg_req, Default::default());
            let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
            let res = serde_json::to_value(agg_res).unwrap();
            // Buckets are only required for fields that actually received values.
            if !fast_field.field_name.contains("empty") && fast_field.typ != Type::Json {
                assert!(
                    !res["termagg"]["buckets"].as_array().unwrap().is_empty(),
                    "{}",
                    fast_field.field_name
                );
            }
        }
    }
    // Documents how fields are addressed when the name of one JSON field ("json.shadow")
    // collides with a path inside another JSON field ("json" containing `shadow.val`).
    #[test]
    fn test_json_field_shadowing_field_name_bug() {
        /// This test only exists to demonstrate a bug when addressing a field that gets shadowed.
        /// The issue only occurs if the name of the shadowing field contains a dot.
        ///
        /// It happens independently of the `expand_dots` option, since that option does not
        /// affect the field name itself.
        use pretty_assertions::assert_eq;
        let mut schema_builder = Schema::builder();
        let json_options: JsonObjectOptions =
            JsonObjectOptions::from(TEXT).set_fast(None).set_stored();
        // let json_options = json_options.set_expand_dots_enabled();
        let json_field_shadow = schema_builder.add_json_field("json.shadow", json_options.clone());
        let json_field = schema_builder.add_json_field("json", json_options.clone());
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
        // A single document: "json.shadow" holds `val = "b"`, "json" holds `shadow.val = "a"`.
        index_writer
            .add_document(
                doc!(json_field_shadow=>json!({"val": "b"}), json_field=>json!({"shadow": {"val": "a"}})),
            )
            .unwrap();
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let fields_and_vals = vec![
            // Using the `\u{1}` separator is the only way to address `shadow.val` inside the
            // `json` field; otherwise the path is shadowed by the `json.shadow` field.
            ("json.shadow\u{1}val".to_string(), "a"), // Succeeds
            //("json.shadow.val".to_string(), "a"),   // Fails
            ("json.shadow.val".to_string(), "b"), // Succeeds
        ];
        let query_parser = QueryParser::for_index(&index, vec![]);
        // Test if field name can be queried
        for (indexed_field, val) in fields_and_vals.iter() {
            let query_str = &format!("{}:{}", indexed_field, val);
            let query = query_parser.parse_query(query_str).unwrap();
            let count_docs = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
            assert!(!count_docs.is_empty(), "{}:{}", indexed_field, val);
        }
        // Test if field name can be used for aggregation
        for (field_name, val) in fields_and_vals.iter() {
            let agg_req_str = json!(
            {
              "termagg": {
                "terms": {
                  "field": field_name,
                }
              }
            });
            let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
            let collector = AggregationCollector::from_aggs(agg_req, Default::default());
            let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
            let res = serde_json::to_value(agg_res).unwrap();
            // The key of the first bucket must be the value stored under the addressed path.
            assert_eq!(
                res["termagg"]["buckets"].as_array().unwrap()[0]["key"]
                    .as_str()
                    .unwrap(),
                *val,
                "{}",
                field_name
            );
        }
    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -221,9 +221,9 @@ pub use self::snippet::{Snippet, SnippetGenerator};
 #[doc(hidden)]
 pub use crate::core::json_utils;
 pub use crate::core::{
-    Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
+    merge_field_meta_data, Executor, FieldMetadata, Index, IndexBuilder, IndexMeta, IndexSettings,
-    Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
+    IndexSortByField, InvertedIndexReader, Order, Searcher, SearcherGeneration, Segment,
-    SegmentReader, SingleSegmentIndexWriter,
+    SegmentComponent, SegmentId, SegmentMeta, SegmentReader, SingleSegmentIndexWriter,
 };
 pub use crate::directory::Directory;
 pub use crate::indexer::IndexWriter;
--- a/src/postings/json_postings_writer.rs
+++ b/src/postings/json_postings_writer.rs
@@ -11,6 +11,10 @@ use crate::schema::{Field, Type, JSON_END_OF_PATH};
 use crate::tokenizer::TokenStream;
 use crate::{DocId, Term};
 /// The `JsonPostingsWriter` is odd in that it relies on a hidden contract:
 ///
 /// `subscribe` is called directly to index non-text tokens, while
 /// `index_text` is used to index text.
 #[derive(Default)]
 pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
    str_posting_writer: SpecializedPostingsWriter<Rec>,
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -63,7 +63,7 @@ pub mod tests {
        let mut segment = index.new_segment();
        let mut posting_serializer = InvertedIndexSerializer::open(&mut segment)?;
        let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4, None)?;
-        field_serializer.new_term("abc".as_bytes(), 12u32)?;
+        field_serializer.new_term("abc".as_bytes(), 12u32, true)?;
        for doc_id in 0u32..120u32 {
            let delta_positions = vec![1, 2, 3, 2];
            field_serializer.write_doc(doc_id, 4, &delta_positions);
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -194,7 +194,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
    ) -> io::Result<()> {
        let recorder: Rec = ctx.term_index.read(addr);
        let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
-        serializer.new_term(term, term_doc_freq)?;
+        serializer.new_term(term, term_doc_freq, recorder.has_term_freq())?;
        recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
        serializer.close_term()?;
        Ok(())
--- a/src/postings/recorder.rs
+++ b/src/postings/recorder.rs
@@ -79,24 +79,20 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
    ///
    /// Returns `None` if not available.
    fn term_doc_freq(&self) -> Option<u32>;
    #[inline]
    fn has_term_freq(&self) -> bool {
        true
    }
 }
 /// Only records the doc ids
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Default)]
 pub struct DocIdRecorder {
    stack: ExpUnrolledLinkedList,
    current_doc: DocId,
 }
 impl Default for DocIdRecorder {
    fn default() -> Self {
        DocIdRecorder {
            stack: ExpUnrolledLinkedList::default(),
            current_doc: u32::MAX,
        }
    }
 }
 impl Recorder for DocIdRecorder {
    #[inline]
    fn current_doc(&self) -> DocId {
@@ -105,8 +101,9 @@ impl Recorder for DocIdRecorder {
    #[inline]
    fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
        let delta = doc - self.current_doc;
        self.current_doc = doc;
-        self.stack.writer(arena).write_u32_vint(doc);
+        self.stack.writer(arena).write_u32_vint(delta);
    }
    #[inline]
@@ -123,21 +120,20 @@ impl Recorder for DocIdRecorder {
        buffer_lender: &mut BufferLender,
    ) {
        let (buffer, doc_ids) = buffer_lender.lend_all();
        self.stack.read_to_end(arena, buffer);
        // TODO avoid reading twice.
        self.stack.read_to_end(arena, buffer);
        if let Some(doc_id_map) = doc_id_map {
-            doc_ids.extend(
+            let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
-                VInt32Reader::new(&buffer[..])
+            doc_ids.extend(iter.map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id)));
                    .map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id)),
            );
            doc_ids.sort_unstable();
            for doc in doc_ids {
                serializer.write_doc(*doc, 0u32, &[][..]);
            }
        } else {
-            for doc in VInt32Reader::new(&buffer[..]) {
+            let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
-                serializer.write_doc(doc, 0u32, &[][..]);
+            for doc_id in iter {
                serializer.write_doc(doc_id, 0u32, &[][..]);
            }
        }
    }
@@ -145,6 +141,19 @@ impl Recorder for DocIdRecorder {
    fn term_doc_freq(&self) -> Option<u32> {
        None
    }
    fn has_term_freq(&self) -> bool {
        false
    }
 }
 /// Converts a stream of delta-encoded values into the stream of their running totals.
 ///
 /// The n-th item yielded is the sum of the first n deltas produced by `iter`.
 fn get_sum_reader(iter: impl Iterator<Item = u32>) -> impl Iterator<Item = u32> {
    let mut running_total = 0u32;
    iter.map(move |delta| {
        running_total += delta;
        running_total
    })
 }
 /// Recorder encoding document ids, and term frequencies
@@ -164,9 +173,10 @@ impl Recorder for TermFrequencyRecorder {
    #[inline]
    fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
        let delta = doc - self.current_doc;
        self.term_doc_freq += 1;
        self.current_doc = doc;
-        self.stack.writer(arena).write_u32_vint(doc);
+        self.stack.writer(arena).write_u32_vint(delta);
    }
    #[inline]
@@ -193,9 +203,12 @@ impl Recorder for TermFrequencyRecorder {
        let mut u32_it = VInt32Reader::new(&buffer[..]);
        if let Some(doc_id_map) = doc_id_map {
            let mut doc_id_and_tf = vec![];
-            while let Some(old_doc_id) = u32_it.next() {
+            let mut prev_doc = 0;
            while let Some(delta_doc_id) = u32_it.next() {
                let doc_id = prev_doc + delta_doc_id;
                prev_doc = doc_id;
                let term_freq = u32_it.next().unwrap_or(self.current_tf);
-                doc_id_and_tf.push((doc_id_map.get_new_doc_id(old_doc_id), term_freq));
+                doc_id_and_tf.push((doc_id_map.get_new_doc_id(doc_id), term_freq));
            }
            doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id);
@@ -203,9 +216,12 @@ impl Recorder for TermFrequencyRecorder {
                serializer.write_doc(doc_id, tf, &[][..]);
            }
        } else {
-            while let Some(doc) = u32_it.next() {
+            let mut prev_doc = 0;
            while let Some(delta_doc_id) = u32_it.next() {
                let doc_id = prev_doc + delta_doc_id;
                prev_doc = doc_id;
                let term_freq = u32_it.next().unwrap_or(self.current_tf);
-                serializer.write_doc(doc, term_freq, &[][..]);
+                serializer.write_doc(doc_id, term_freq, &[][..]);
            }
        }
    }
@@ -216,23 +232,13 @@ impl Recorder for TermFrequencyRecorder {
 }
 /// Recorder encoding term frequencies as well as positions.
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Default)]
 pub struct TfAndPositionRecorder {
    stack: ExpUnrolledLinkedList,
    current_doc: DocId,
    term_doc_freq: u32,
 }
 impl Default for TfAndPositionRecorder {
    fn default() -> Self {
        TfAndPositionRecorder {
            stack: ExpUnrolledLinkedList::default(),
            current_doc: u32::MAX,
            term_doc_freq: 0u32,
        }
    }
 }
 impl Recorder for TfAndPositionRecorder {
    #[inline]
    fn current_doc(&self) -> DocId {
@@ -241,9 +247,10 @@ impl Recorder for TfAndPositionRecorder {
    #[inline]
    fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
        let delta = doc - self.current_doc;
        self.current_doc = doc;
        self.term_doc_freq += 1u32;
-        self.stack.writer(arena).write_u32_vint(doc);
+        self.stack.writer(arena).write_u32_vint(delta);
    }
    #[inline]
@@ -269,7 +276,10 @@ impl Recorder for TfAndPositionRecorder {
        self.stack.read_to_end(arena, buffer_u8);
        let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
        let mut doc_id_and_positions = vec![];
-        while let Some(doc) = u32_it.next() {
+        let mut prev_doc = 0;
        while let Some(delta_doc_id) = u32_it.next() {
            let doc_id = prev_doc + delta_doc_id;
            prev_doc = doc_id;
            let mut prev_position_plus_one = 1u32;
            buffer_positions.clear();
            loop {
@@ -287,9 +297,9 @@ impl Recorder for TfAndPositionRecorder {
            if let Some(doc_id_map) = doc_id_map {
                // this simple variant to remap may consume too much memory
                doc_id_and_positions
-                    .push((doc_id_map.get_new_doc_id(doc), buffer_positions.to_vec()));
+                    .push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec()));
            } else {
-                serializer.write_doc(doc, buffer_positions.len() as u32, buffer_positions);
+                serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
            }
        }
        if doc_id_map.is_some() {
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -71,7 +71,7 @@ impl SegmentPostings {
        {
            let mut postings_serializer =
                PostingsSerializer::new(&mut buffer, 0.0, IndexRecordOption::Basic, None);
-            postings_serializer.new_term(docs.len() as u32);
+            postings_serializer.new_term(docs.len() as u32, false);
            for &doc in docs {
                postings_serializer.write_doc(doc, 1u32);
            }
@@ -120,7 +120,7 @@ impl SegmentPostings {
            IndexRecordOption::WithFreqs,
            fieldnorm_reader,
        );
-        postings_serializer.new_term(doc_and_tfs.len() as u32);
+        postings_serializer.new_term(doc_and_tfs.len() as u32, true);
        for &(doc, tf) in doc_and_tfs {
            postings_serializer.write_doc(doc, tf);
        }
@@ -238,14 +238,18 @@ impl Postings for SegmentPostings {
    }
    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
-        let term_freq = self.term_freq() as usize;
+        let term_freq = self.term_freq();
        if let Some(position_reader) = self.position_reader.as_mut() {
            debug_assert!(
                !self.block_cursor.freqs().is_empty(),
                "No positions available"
            );
            let read_offset = self.block_cursor.position_offset()
                + (self.block_cursor.freqs()[..self.cur]
                    .iter()
                    .cloned()
                    .sum::<u32>() as u64);
-            output.resize(term_freq, 0u32);
+            output.resize(term_freq as usize, 0u32);
            position_reader.read(read_offset, &mut output[..]);
            let mut cum = offset;
            for output_mut in output.iter_mut() {
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -168,7 +168,12 @@ impl<'a> FieldSerializer<'a> {
    /// * term - the term. It needs to come after the previous term according to the lexicographical
    ///   order.
    /// * term_doc_freq - return the number of document containing the term.
-    pub fn new_term(&mut self, term: &[u8], term_doc_freq: u32) -> io::Result<()> {
+    pub fn new_term(
        &mut self,
        term: &[u8],
        term_doc_freq: u32,
        record_term_freq: bool,
    ) -> io::Result<()> {
        assert!(
            !self.term_open,
            "Called new_term, while the previous term was not closed."
@@ -177,7 +182,8 @@ impl<'a> FieldSerializer<'a> {
        self.postings_serializer.clear();
        self.current_term_info = self.current_term_info();
        self.term_dictionary_builder.insert_key(term)?;
-        self.postings_serializer.new_term(term_doc_freq);
+        self.postings_serializer
            .new_term(term_doc_freq, record_term_freq);
        Ok(())
    }
@@ -330,10 +336,10 @@ impl<W: Write> PostingsSerializer<W> {
        }
    }
-    pub fn new_term(&mut self, term_doc_freq: u32) {
+    pub fn new_term(&mut self, term_doc_freq: u32, record_term_freq: bool) {
        self.bm25_weight = None;
-        self.term_has_freq = self.mode.has_freq() && term_doc_freq != 0;
+        self.term_has_freq = self.mode.has_freq() && record_term_freq;
        if !self.term_has_freq {
            return;
        }
@@ -349,7 +355,7 @@ impl<W: Write> PostingsSerializer<W> {
            return;
        }
-        self.bm25_weight = Some(Bm25Weight::for_one_term(
+        self.bm25_weight = Some(Bm25Weight::for_one_term_without_explain(
            term_doc_freq as u64,
            num_docs_in_segment,
            self.avg_fieldnorm,
--- a/src/query/bm25.rs
+++ b/src/query/bm25.rs
@@ -77,7 +77,7 @@ pub struct Bm25Params {
 /// A struct used for computing BM25 scores.
 #[derive(Clone)]
 pub struct Bm25Weight {
-    idf_explain: Explanation,
+    idf_explain: Option<Explanation>,
    weight: Score,
    cache: [Score; 256],
    average_fieldnorm: Score,
@@ -147,11 +147,30 @@ impl Bm25Weight {
        idf_explain.add_const("N, total number of docs", total_num_docs as Score);
        Bm25Weight::new(idf_explain, avg_fieldnorm)
    }
    /// Builds the [Bm25Weight] of a single term.
    ///
    /// The idf is computed from `term_doc_freq` and `total_num_docs`. Unlike the explaining
    /// constructors, the resulting weight does not carry the [Explanation] for the idf.
    pub fn for_one_term_without_explain(
        term_doc_freq: u64,
        total_num_docs: u64,
        avg_fieldnorm: Score,
    ) -> Bm25Weight {
        Bm25Weight::new_without_explain(idf(term_doc_freq, total_num_docs), avg_fieldnorm)
    }
    pub(crate) fn new(idf_explain: Explanation, average_fieldnorm: Score) -> Bm25Weight {
        let weight = idf_explain.value() * (1.0 + K1);
        Bm25Weight {
-            idf_explain,
+            idf_explain: Some(idf_explain),
            weight,
            cache: compute_tf_cache(average_fieldnorm),
            average_fieldnorm,
        }
    }
    pub(crate) fn new_without_explain(idf: f32, average_fieldnorm: Score) -> Bm25Weight {
        let weight = idf * (1.0 + K1);
        Bm25Weight {
            idf_explain: None,
            weight,
            cache: compute_tf_cache(average_fieldnorm),
            average_fieldnorm,
@@ -202,7 +221,9 @@ impl Bm25Weight {
        let mut explanation = Explanation::new("TermQuery, product of...", score);
        explanation.add_detail(Explanation::new("(K1+1)", K1 + 1.0));
-        explanation.add_detail(self.idf_explain.clone());
+        if let Some(idf_explain) = &self.idf_explain {
            explanation.add_detail(idf_explain.clone());
        }
        explanation.add_detail(tf_explanation);
        explanation
    }
--- a/src/query/boost_query.rs
+++ b/src/query/boost_query.rs
@@ -74,7 +74,8 @@ impl Weight for BoostWeight {
    fn explain(&self, reader: &SegmentReader, doc: u32) -> crate::Result<Explanation> {
        let underlying_explanation = self.weight.explain(reader, doc)?;
        let score = underlying_explanation.value() * self.boost;
-        let mut explanation = Explanation::new(format!("Boost x{} of ...", self.boost), score);
+        let mut explanation =
            Explanation::new_with_string(format!("Boost x{} of ...", self.boost), score);
        explanation.add_detail(underlying_explanation);
        Ok(explanation)
    }
@@ -151,7 +152,7 @@ mod tests {
        let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap();
        assert_eq!(
            explanation.to_pretty_json(),
-            "{\n  \"value\": 0.2,\n  \"description\": \"Boost x0.2 of ...\",\n  \"details\": [\n    {\n      \"value\": 1.0,\n      \"description\": \"AllQuery\",\n      \"context\": []\n    }\n  ],\n  \"context\": []\n}"
+            "{\n  \"value\": 0.2,\n  \"description\": \"Boost x0.2 of ...\",\n  \"details\": [\n    {\n      \"value\": 1.0,\n      \"description\": \"AllQuery\"\n    }\n  ]\n}"
        );
        Ok(())
    }
--- a/src/query/const_score_query.rs
+++ b/src/query/const_score_query.rs
@@ -164,11 +164,9 @@ mod tests {
  "details": [
    {
      "value": 1.0,
-      "description": "AllQuery",
+      "description": "AllQuery"
      "context": []
    }
-  ],
+  ]
  "context": []
 }"#
        );
        Ok(())
--- a/src/query/explanation.rs
+++ b/src/query/explanation.rs
@@ -1,3 +1,4 @@
 use std::borrow::Cow;
 use std::fmt;
 use serde::Serialize;
@@ -16,12 +17,12 @@ pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
 #[derive(Clone, Serialize)]
 pub struct Explanation {
    value: Score,
-    description: String,
+    description: Cow<'static, str>,
-    #[serde(skip_serializing_if = "Vec::is_empty")]
+    #[serde(skip_serializing_if = "Option::is_none")]
-    details: Vec<Explanation>,
+    details: Option<Vec<Explanation>>,
-    context: Vec<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
    context: Option<Vec<String>>,
 }
 impl fmt::Debug for Explanation {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Explanation({})", self.to_pretty_json())
@@ -30,12 +31,21 @@ impl fmt::Debug for Explanation {
 impl Explanation {
    /// Creates a new explanation object.
-    pub fn new<T: ToString>(description: T, value: Score) -> Explanation {
+    pub fn new_with_string(description: String, value: Score) -> Explanation {
        Explanation {
            value,
-            description: description.to_string(),
+            description: Cow::Owned(description),
-            details: vec![],
+            details: None,
-            context: vec![],
+            context: None,
        }
    }
    /// Creates a new explanation object from a `'static` description.
    ///
    /// The description is kept as a borrowed [`Cow`], and the explanation starts with neither
    /// details nor context. For a description that is built at runtime, see
    /// [`Explanation::new_with_string`].
    pub fn new(description: &'static str, value: Score) -> Explanation {
        Explanation {
            value,
            description: Cow::Borrowed(description),
            details: None,
            context: None,
        }
    }
@@ -48,17 +58,21 @@ impl Explanation {
    ///
    /// Details are treated as child of the current node.
    pub fn add_detail(&mut self, child_explanation: Explanation) {
-        self.details.push(child_explanation);
+        self.details
            .get_or_insert_with(Vec::new)
            .push(child_explanation);
    }
    /// Adds some extra context to the explanation.
    pub fn add_context(&mut self, context: String) {
-        self.context.push(context);
+        self.context.get_or_insert_with(Vec::new).push(context);
    }
    /// Shortcut for `self.details.push(Explanation::new(name, value));`
-    pub fn add_const<T: ToString>(&mut self, name: T, value: Score) {
+    pub fn add_const(&mut self, name: &'static str, value: Score) {
-        self.details.push(Explanation::new(name, value));
+        self.details
            .get_or_insert_with(Vec::new)
            .push(Explanation::new(name, value));
    }
    /// Returns an indented json representation of the explanation tree for debug usage.
--- a/src/query/term_query/term_query.rs
+++ b/src/query/term_query/term_query.rs
@@ -101,7 +101,7 @@ impl TermQuery {
                ..
            } => Bm25Weight::for_terms(statistics_provider, &[self.term.clone()])?,
            EnableScoring::Disabled { .. } => {
-                Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32)
+                Bm25Weight::new(Explanation::new("<no score>", 1.0f32), 1.0f32)
            }
        };
        let scoring_enabled = enable_scoring.is_scoring_enabled();
--- a/src/schema/bytes_options.rs
+++ b/src/schema/bytes_options.rs
@@ -40,21 +40,25 @@ impl From<BytesOptionsDeser> for BytesOptions {
 impl BytesOptions {
    /// Returns true if the value is indexed.
    #[inline]
    pub fn is_indexed(&self) -> bool {
        self.indexed
    }
    /// Returns true if and only if the value is normed.
    #[inline]
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms
    }
    /// Returns true if the value is a fast field.
    #[inline]
    pub fn is_fast(&self) -> bool {
        self.fast
    }
    /// Returns true if the value is stored.
    #[inline]
    pub fn is_stored(&self) -> bool {
        self.stored
    }
--- a/src/schema/date_time_options.rs
+++ b/src/schema/date_time_options.rs
@@ -27,21 +27,25 @@ pub struct DateOptions {
 impl DateOptions {
    /// Returns true iff the value is stored.
    #[inline]
    pub fn is_stored(&self) -> bool {
        self.stored
    }
    /// Returns true iff the value is indexed and therefore searchable.
    #[inline]
    pub fn is_indexed(&self) -> bool {
        self.indexed
    }
    /// Returns true iff fieldnorms are enabled for the field and the field is indexed.
    ///
    /// The `fieldnorms` flag alone is not sufficient: a non-indexed field never reports
    /// fieldnorms.
    #[inline]
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms && self.indexed
    }
    /// Returns true iff the value is a fast field.
    #[inline]
    pub fn is_fast(&self) -> bool {
        self.fast
    }
--- a/src/schema/facet.rs
+++ b/src/schema/facet.rs
@@ -131,16 +131,16 @@ impl Facet {
    pub fn from_path<Path>(path: Path) -> Facet
    where
        Path: IntoIterator,
-        Path::Item: ToString,
+        Path::Item: AsRef<str>,
    {
        let mut facet_string: String = String::with_capacity(100);
        let mut step_it = path.into_iter();
        if let Some(step) = step_it.next() {
-            facet_string.push_str(&step.to_string());
+            facet_string.push_str(step.as_ref());
        }
        for step in step_it {
            facet_string.push(FACET_SEP_CHAR);
-            facet_string.push_str(&step.to_string());
+            facet_string.push_str(step.as_ref());
        }
        Facet(facet_string)
    }
--- a/src/schema/facet_options.rs
+++ b/src/schema/facet_options.rs
@@ -14,6 +14,7 @@ pub struct FacetOptions {
 impl FacetOptions {
    /// Returns true if the value is stored.
    #[inline]
    pub fn is_stored(&self) -> bool {
        self.stored
    }
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -108,7 +108,16 @@ impl FieldEntry {
        self.field_type.is_fast()
    }
    /// Returns whether the expand dots option is enabled on this field.
    ///
    /// Only json object fields carry this option; any other field type yields `false`.
    pub fn is_expand_dots_enabled(&self) -> bool {
        if let FieldType::JsonObject(json_options) = &self.field_type {
            json_options.is_expand_dots_enabled()
        } else {
            false
        }
    }
    /// Returns true if the field is stored
    #[inline]
    pub fn is_stored(&self) -> bool {
        match self.field_type {
            FieldType::U64(ref options)
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -3,6 +3,7 @@ use std::str::FromStr;
 use base64::engine::general_purpose::STANDARD as BASE64;
 use base64::Engine;
 use columnar::ColumnType;
 use serde::{Deserialize, Serialize};
 use serde_json::Value as JsonValue;
 use thiserror::Error;
@@ -47,7 +48,7 @@ pub enum ValueParsingError {
 ///
 /// Contrary to FieldType, this does
 /// not include the way the field must be indexed.
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
 #[repr(u8)]
 pub enum Type {
    /// `&str`
@@ -72,6 +73,21 @@ pub enum Type {
    IpAddr = b'p',
 }
 // Converts a columnar `ColumnType` into the matching schema `Type`.
 impl From<ColumnType> for Type {
    fn from(value: ColumnType) -> Self {
        // No wildcard arm: every `ColumnType` variant is mapped explicitly.
        match value {
            ColumnType::Str => Type::Str,
            ColumnType::U64 => Type::U64,
            ColumnType::I64 => Type::I64,
            ColumnType::F64 => Type::F64,
            ColumnType::Bool => Type::Bool,
            // The only variant whose name differs between the two enums.
            ColumnType::DateTime => Type::Date,
            ColumnType::Bytes => Type::Bytes,
            ColumnType::IpAddr => Type::IpAddr,
        }
    }
 }
 const ALL_TYPES: [Type; 10] = [
    Type::Str,
    Type::U64,
--- a/src/schema/ip_options.rs
+++ b/src/schema/ip_options.rs
@@ -31,21 +31,25 @@ pub struct IpAddrOptions {
 impl IpAddrOptions {
    /// Returns true iff the value is a fast field.
    #[inline]
    pub fn is_fast(&self) -> bool {
        self.fast
    }
    /// Returns `true` if the ip address should be stored in the doc store.
    #[inline]
    pub fn is_stored(&self) -> bool {
        self.stored
    }
    /// Returns true iff the value is indexed and therefore searchable.
    #[inline]
    pub fn is_indexed(&self) -> bool {
        self.indexed
    }
    /// Returns true if and only if the value is normed.
    #[inline]
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms
    }
--- a/src/schema/json_object_options.rs
+++ b/src/schema/json_object_options.rs
@@ -46,17 +46,20 @@ pub struct JsonObjectOptions {
 impl JsonObjectOptions {
    /// Returns `true` if the json object should be stored.
    #[inline]
    pub fn is_stored(&self) -> bool {
        self.stored
    }
    /// Returns `true` iff the json object should be indexed.
    #[inline]
    pub fn is_indexed(&self) -> bool {
        self.indexing.is_some()
    }
    /// Returns true if and only if the json object fields are
    /// to be treated as fast fields.
    #[inline]
    pub fn is_fast(&self) -> bool {
        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
            || matches!(
@@ -66,6 +69,7 @@ impl JsonObjectOptions {
    }
    /// Returns the name of the tokenizer configured for the fast field, if any.
    #[inline]
    pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
        match &self.fast {
            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
@@ -87,6 +91,7 @@ impl JsonObjectOptions {
    ///
    /// If disabled, the "." needs to be escaped:
    /// `k8s\.node\.id:5`.
    #[inline]
    pub fn is_expand_dots_enabled(&self) -> bool {
        self.expand_dots_enabled
    }
@@ -103,6 +108,7 @@ impl JsonObjectOptions {
    /// If set to `Some` then both int and str values will be indexed.
    /// The inner `TextFieldIndexing` will however, only apply to the str values
    /// in the json object.
    #[inline]
    pub fn get_text_indexing_options(&self) -> Option<&TextFieldIndexing> {
        self.indexing.as_ref()
    }
--- a/src/schema/numeric_options.rs
+++ b/src/schema/numeric_options.rs
@@ -57,26 +57,31 @@ impl From<NumericOptionsDeser> for NumericOptions {
 impl NumericOptions {
    /// Returns true iff the value is stored in the doc store.
    #[inline]
    pub fn is_stored(&self) -> bool {
        self.stored
    }
    /// Returns true iff the value is indexed and therefore searchable.
    #[inline]
    pub fn is_indexed(&self) -> bool {
        self.indexed
    }
    /// Returns true iff the field has fieldnorm.
    #[inline]
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms && self.indexed
    }
    /// Returns true iff the value is a fast field.
    #[inline]
    pub fn is_fast(&self) -> bool {
        self.fast
    }
    /// Returns true if values should be coerced to numbers.
    #[inline]
    pub fn should_coerce(&self) -> bool {
        self.coerce
    }
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -8,6 +8,7 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use super::ip_options::IpAddrOptions;
 use super::*;
 use crate::json_utils::split_json_path;
 use crate::schema::bytes_options::BytesOptions;
 use crate::TantivyError;
@@ -30,7 +31,7 @@ use crate::TantivyError;
 /// let body_field = schema_builder.add_text_field("body", TEXT);
 /// let schema = schema_builder.build();
 /// ```
-#[derive(Default)]
+#[derive(Debug, Default)]
 pub struct SchemaBuilder {
    fields: Vec<FieldEntry>,
    fields_map: HashMap<String, Field>,
@@ -328,12 +329,19 @@ impl Schema {
        if let Some(field) = self.0.fields_map.get(full_path) {
            return Some((*field, ""));
        }
        let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
        while let Some(pos) = splitting_period_pos.pop() {
            let (prefix, suffix) = full_path.split_at(pos);
            if let Some(field) = self.0.fields_map.get(prefix) {
                return Some((*field, &suffix[1..]));
            }
            // JSON path may contain a dot, for now we try both variants to find the field.
            let prefix = split_json_path(prefix).join(".");
            if let Some(field) = self.0.fields_map.get(&prefix) {
                return Some((*field, &suffix[1..]));
            }
        }
        None
    }
@@ -349,6 +357,7 @@ impl Schema {
    pub fn find_field_with_default<'a>(
        &self,
        full_path: &'a str,
        default_field_opt: Option<Field>,
    ) -> Option<(Field, &'a str)> {
        let (field, json_path) = self
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -72,16 +72,19 @@ fn is_false(val: &bool) -> bool {
 impl TextOptions {
    /// Returns the indexing options.
    #[inline]
    pub fn get_indexing_options(&self) -> Option<&TextFieldIndexing> {
        self.indexing.as_ref()
    }
    /// Returns true if the text is to be stored.
    #[inline]
    pub fn is_stored(&self) -> bool {
        self.stored
    }
    /// Returns true if and only if the value is a fast field.
    #[inline]
    pub fn is_fast(&self) -> bool {
        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
            || matches!(
@@ -91,6 +94,7 @@ impl TextOptions {
    }
    /// Returns the name of the tokenizer configured for the fast field, if any.
    #[inline]
    pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
        match &self.fast {
            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
@@ -101,6 +105,7 @@ impl TextOptions {
    }
    /// Returns true if values should be coerced to strings (numbers, null).
    #[inline]
    pub fn should_coerce(&self) -> bool {
        self.coerce
    }
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -37,6 +37,8 @@ mod reader;
 mod writer;
 pub use self::compressors::{Compressor, ZstdCompressor};
 pub use self::decompressors::Decompressor;
 #[cfg(feature = "quickwit")]
 pub(crate) use self::reader::CacheKey;
 pub(crate) use self::reader::DOCSTORE_CACHE_CAPACITY;
 pub use self::reader::{CacheStats, StoreReader};
 pub use self::writer::StoreWriter;
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -40,6 +40,15 @@ struct BlockCache {
 }
 impl BlockCache {
    /// Builds a block cache with room for `cache_num_blocks` blocks and zeroed counters.
    ///
    /// A capacity of `0` leaves `cache` set to `None`, since no `NonZeroUsize` can be built.
    fn new(cache_num_blocks: usize) -> Self {
        let lru = NonZeroUsize::new(cache_num_blocks)
            .map(|capacity| Mutex::new(LruCache::new(capacity)));
        Self {
            cache: lru,
            cache_hits: Default::default(),
            cache_misses: Default::default(),
        }
    }
    fn get_from_cache(&self, pos: usize) -> Option<Block> {
        if let Some(block) = self
            .cache
@@ -81,6 +90,10 @@ impl BlockCache {
    }
 }
 /// Opaque cache key which indicates which documents are cached together.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub(crate) struct CacheKey(usize);
 #[derive(Debug, Default)]
 /// CacheStats for the `StoreReader`.
 pub struct CacheStats {
@@ -128,17 +141,35 @@ impl StoreReader {
        Ok(StoreReader {
            decompressor: footer.decompressor,
            data: data_file,
-            cache: BlockCache {
+            cache: BlockCache::new(cache_num_blocks),
                cache: NonZeroUsize::new(cache_num_blocks)
                    .map(|cache_num_blocks| Mutex::new(LruCache::new(cache_num_blocks))),
                cache_hits: Default::default(),
                cache_misses: Default::default(),
            },
            skip_index: Arc::new(skip_index),
            space_usage,
        })
    }
    /// Creates a copy of this store reader backed by its own block cache
    /// holding up to `cache_num_blocks` blocks.
    ///
    /// For every key in `cache_keys`, the block is copied from the current cache
    /// into the new one when it is already present there. Keys whose block is not
    /// cached are skipped.
    #[cfg(feature = "quickwit")]
    pub(crate) fn fork_cache(&self, cache_num_blocks: usize, cache_keys: &[CacheKey]) -> Self {
        let seeded_cache = BlockCache::new(cache_num_blocks);
        for key in cache_keys {
            let CacheKey(pos) = *key;
            let Some(block) = self.cache.get_from_cache(pos) else {
                continue;
            };
            seeded_cache.put_into_cache(pos, block);
        }
        Self {
            decompressor: self.decompressor,
            data: self.data.clone(),
            cache: seeded_cache,
            skip_index: Arc::clone(&self.skip_index),
            space_usage: self.space_usage.clone(),
        }
    }
    pub(crate) fn block_checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
        self.skip_index.checkpoints()
    }
@@ -152,6 +183,21 @@ impl StoreReader {
        self.cache.stats()
    }
    /// Returns the cache key for a given document.
    ///
    /// These keys are opaque and are not used with the public API,
    /// but having the same cache key means that the documents
    /// will only require one I/O and decompression operation
    /// when retrieved from the same store reader consecutively.
    ///
    /// Note that looking up the cache key of a document
    /// will not yet pull anything into the block cache.
    #[cfg(feature = "quickwit")]
    pub(crate) fn cache_key(&self, doc_id: DocId) -> crate::Result<CacheKey> {
        let checkpoint = self.block_checkpoint(doc_id)?;
        // The key is the start offset of the byte range of the block's checkpoint.
        Ok(CacheKey(checkpoint.byte_range.start))
    }
    /// Get checkpoint for `DocId`. The checkpoint can be used to load a block containing the
    /// document.
    ///
--- a/sstable/Cargo.toml
+++ b/sstable/Cargo.toml
@@ -11,16 +11,22 @@ description = "sstables for tantivy"
 [dependencies]
 common = {version= "0.6", path="../common", package="tantivy-common"}
-tantivy-fst = "0.4"
+tantivy-bitpacker = { version= "0.5", path="../bitpacker" }
 tantivy-fst = "0.5"
 # experimental gives us access to Decompressor::upper_bound
 zstd = { version = "0.13", features = ["experimental"] }
 [dev-dependencies]
 proptest = "1"
-criterion = "0.5"
+criterion = { version = "0.5", default-features = false }
 names = "0.14"
 rand = "0.8"
 [[bench]]
 name = "stream_bench"
 harness = false
 [[bench]]
 name = "ord_to_term"
 harness = false
--- a/sstable/README.md
+++ b/sstable/README.md
@@ -89,39 +89,71 @@ Note: as the SSTable does not support redundant keys, there is no ambiguity betw
 ### SSTFooter
 ```
-+-------+-------+-----+------------------+------------+-------------+---------+---------+
+-----+----------------+-------------+-------------+---------+---------+
-| Block | Block | ... | FirstLayerOffset | LayerCount | IndexOffset | NumTerm | Version |
+| Fst | BlockAddrStore | StoreOffset | IndexOffset | NumTerm | Version |
-+-------+-------+-----+------------------+------------+-------------+---------+---------+
+-----+----------------+-------------+-------------+---------+---------+
 |----(# of blocks)----|
 ```
- Block(SSTBlock): uses IndexValue for its Values format
+- Fst(Fst): finite state transducer mapping keys to a block number
- FirstLayerOffset(u64): Offset between the start of the footer and the start of the top level index
+- BlockAddrStore(BlockAddrStore): store mapping a block number to its BlockAddr
- LayerCount(u32): Number of layers of index (min 1) ## TODO do we want to use 0 as a marker for no layers? It makes small sstables 12 bytes more compact (the 0u32 would alias with the "end of sstable marker")
+- StoreOffset(u64): Offset to start of the BlockAddrStore. If zero, see the SingleBlockSStable section
 - IndexOffset(u64): Offset to the start of the SSTFooter
 - NumTerm(u64): number of terms in the sstable
 - Version(u32): Currently equal to 3
-Blocks referencing the main table and block referencing the index itself are encoded the same way and
+### Fst
 are not directly differentiated. Offsets in blocks referencing the index are relative to the start of
 the footer, blocks referencing the main table are relative to the start of that table.
-### IndexValue
+Fst is in the format of tantivy\_fst
 ```
 +------------+----------+-------+-------+-----+
 | EntryCount | StartPos | Entry | Entry | ... |
 +------------+----------+-------+-------+-----+
                        |---( # of entries)---|
 ```
- EntryCount(VInt): number of entries
+### BlockAddrStore
 - StartPos(VInt): the start pos of the first (data) block referenced by this (index) block
 - Entry (IndexEntry)
-### Entry
+---------+-----------+-----------+-----+-----------+-----------+-----+
-```
+| MetaLen | BlockMeta | BlockMeta | ... | BlockData | BlockData | ... |
-+----------+--------------+
+---------+-----------+-----------+-----+-----------+-----------+-----+
-| BlockLen | FirstOrdinal |
+          |---------(N blocks)----------|---------(N blocks)----------|
-+----------+--------------+
+
-```
+- MetaLen(u64): length of the BlockMeta section
- BlockLen(VInt): length of the block
+- BlockMeta(BlockAddrBlockMetadata): metadata to seek through BlockData
- FirstOrdinal(VInt): ordinal of the first element in the given block
+- BlockData(CompactedBlockAddr): bitpacked per block metadata
 ### BlockAddrBlockMetadata
 +--------+------------+--------------+------------+--------------+-------------------+-----------------+----------+
 | Offset | RangeStart | FirstOrdinal | RangeSlope | OrdinalSlope | FirstOrdinalNBits | RangeStartNBits | BlockLen |
 +--------+------------+--------------+------------+--------------+-------------------+-----------------+----------+
 - Offset(u64): offset of the corresponding BlockData in the datastream
 - RangeStart(u64): the start position of the first block
 - FirstOrdinal(u64): the first ordinal of the first block
 - RangeSlope(u32): slope predicted for start range evolution (see computation in BlockData)
 - OrdinalSlope(u64): slope predicted for first ordinal evolution (see computation in BlockData)
 - FirstOrdinalNBits(u8): number of bits per ordinal in datastream (see computation in BlockData)
 - RangeStartNBits(u8): number of bits per range start in datastream (see computation in BlockData)
 ### BlockData
 +-----------------+-------------------+---------------+
 | RangeStartDelta | FirstOrdinalDelta | FinalRangeEnd |
 +-----------------+-------------------+---------------+
 |------(BlockLen repetitions)---------|
 - RangeStartDelta(var): RangeStartNBits *bits* of little endian number. See below for decoding
 - FirstOrdinalDelta(var): FirstOrdinalNBits *bits* of little endian number. See below for decoding
 - FinalRangeEnd(var): RangeStartNBits *bits* of integer. See below for decoding
 Converting a BlockData of index Index and a BlockAddrBlockMetadata to an actual block address is done as follows:
 range\_prediction := RangeStart + Index * RangeSlope;
 range\_derivation := RangeStartDelta - (1 << (RangeStartNBits-1));
 range\_start := range\_prediction + range\_derivation
 The same computation can be done for ordinal.
 Note that `range_derivation` can take negative value. `RangeStartDelta` is just its translation to a positive range.
 ## SingleBlockSStable
 The format used for the index is meant to be compact, however it has a constant cost of around 70
 bytes, which isn't negligible for a table containing very few keys.
 To limit the impact of that constant cost, single block sstables omit the Fst and BlockAddrStore from
 their index. Instead a block with first ordinal of 0, range start of 0 and range end of IndexOffset
 is implicitly used for every operation.
--- a/sstable/benches/ord_to_term.rs
+++ b/sstable/benches/ord_to_term.rs
@@ -0,0 +1,110 @@
 use std::sync::Arc;
 use common::file_slice::FileSlice;
 use common::OwnedBytes;
 use criterion::{criterion_group, criterion_main, Criterion};
 use tantivy_sstable::{self, Dictionary, MonotonicU64SSTable};
 fn make_test_sstable(suffix: &str) -> FileSlice {
    let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
    // 125 mio elements
    for elem in 0..125_000_000 {
        let key = format!("prefix.{elem:07X}{suffix}").into_bytes();
        builder.insert(&key, &elem).unwrap();
    }
    let table = builder.finish().unwrap();
    let table = Arc::new(OwnedBytes::new(table));
    let slice = common::file_slice::FileSlice::new(table.clone());
    slice
 }
 /// Registers the ordinal -> term and term -> ordinal lookup benchmarks.
 ///
 /// The same four scenarios run twice: first on keys ending in `.suffix`, then on keys
 /// without any suffix. The `open_and_*` variants re-open the dictionary inside the
 /// measured closure, so they also measure `Dictionary::open`.
 pub fn criterion_benchmark(c: &mut Criterion) {
    {
        // Keys of the form `prefix.XXXXXXX.suffix`.
        let slice = make_test_sstable(".suffix");
        let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
        c.bench_function("ord_to_term_suffix", |b| {
            let mut res = Vec::new();
            b.iter(|| {
                assert!(dict.ord_to_term(100_000, &mut res).unwrap());
                assert!(dict.ord_to_term(19_000_000, &mut res).unwrap());
            })
        });
        c.bench_function("open_and_ord_to_term_suffix", |b| {
            let mut res = Vec::new();
            b.iter(|| {
                let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
                assert!(dict.ord_to_term(100_000, &mut res).unwrap());
                assert!(dict.ord_to_term(19_000_000, &mut res).unwrap());
            })
        });
        c.bench_function("term_ord_suffix", |b| {
            b.iter(|| {
                // 0x00186A0 == 100_000 and 0x121EAC0 == 19_000_000.
                assert_eq!(
                    dict.term_ord(b"prefix.00186A0.suffix").unwrap().unwrap(),
                    100_000
                );
                assert_eq!(
                    dict.term_ord(b"prefix.121EAC0.suffix").unwrap().unwrap(),
                    19_000_000
                );
            })
        });
        c.bench_function("open_and_term_ord_suffix", |b| {
            b.iter(|| {
                let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
                assert_eq!(
                    dict.term_ord(b"prefix.00186A0.suffix").unwrap().unwrap(),
                    100_000
                );
                assert_eq!(
                    dict.term_ord(b"prefix.121EAC0.suffix").unwrap().unwrap(),
                    19_000_000
                );
            })
        });
    }
    {
        // Same scenarios on keys of the form `prefix.XXXXXXX` (empty suffix).
        let slice = make_test_sstable("");
        let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
        c.bench_function("ord_to_term", |b| {
            let mut res = Vec::new();
            b.iter(|| {
                assert!(dict.ord_to_term(100_000, &mut res).unwrap());
                assert!(dict.ord_to_term(19_000_000, &mut res).unwrap());
            })
        });
        c.bench_function("open_and_ord_to_term", |b| {
            let mut res = Vec::new();
            b.iter(|| {
                let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
                assert!(dict.ord_to_term(100_000, &mut res).unwrap());
                assert!(dict.ord_to_term(19_000_000, &mut res).unwrap());
            })
        });
        c.bench_function("term_ord", |b| {
            b.iter(|| {
                // 0x00186A0 == 100_000 and 0x121EAC0 == 19_000_000.
                assert_eq!(dict.term_ord(b"prefix.00186A0").unwrap().unwrap(), 100_000);
                assert_eq!(
                    dict.term_ord(b"prefix.121EAC0").unwrap().unwrap(),
                    19_000_000
                );
            })
        });
        c.bench_function("open_and_term_ord", |b| {
            b.iter(|| {
                let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
                assert_eq!(dict.term_ord(b"prefix.00186A0").unwrap().unwrap(), 100_000);
                assert_eq!(
                    dict.term_ord(b"prefix.121EAC0").unwrap().unwrap(),
                    19_000_000
                );
            })
        });
    }
 }
 criterion_group!(benches, criterion_benchmark);
 criterion_main!(benches);
--- a/sstable/src/delta.rs
+++ b/sstable/src/delta.rs
@@ -20,7 +20,6 @@ where W: io::Write
    // Only here to avoid allocations.
    stateless_buffer: Vec<u8>,
    block_len: usize,
    compress: bool,
 }
 impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
@@ -35,18 +34,6 @@ where
            value_writer: TValueWriter::default(),
            stateless_buffer: Vec::new(),
            block_len: BLOCK_LEN,
            compress: true,
        }
    }
    pub fn new_no_compression(wrt: W) -> Self {
        DeltaWriter {
            block: Vec::with_capacity(BLOCK_LEN * 2),
            write: CountingWriter::wrap(BufWriter::new(wrt)),
            value_writer: TValueWriter::default(),
            stateless_buffer: Vec::new(),
            block_len: BLOCK_LEN,
            compress: false,
        }
    }
@@ -66,7 +53,7 @@ where
        let block_len = buffer.len() + self.block.len();
-        if block_len > 2048 && self.compress {
+        if block_len > 2048 {
            buffer.extend_from_slice(&self.block);
            self.block.clear();
--- a/sstable/src/dictionary.rs
+++ b/sstable/src/dictionary.rs
@@ -9,8 +9,11 @@ use common::{BinarySerializable, OwnedBytes};
 use tantivy_fst::automaton::AlwaysMatch;
 use tantivy_fst::Automaton;
 use crate::sstable_index_v3::SSTableIndexV3Empty;
 use crate::streamer::{Streamer, StreamerBuilder};
-use crate::{BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, TermOrdinal, VoidSSTable};
+use crate::{
    BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, SSTableIndexV3, TermOrdinal, VoidSSTable,
 };
 /// An SSTable is a sorted map that associates sorted `&[u8]` keys
 /// to any kind of typed values.
@@ -128,144 +131,99 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
        key_range: impl RangeBounds<[u8]>,
        limit: Option<u64>,
    ) -> FileSlice {
-        // we don't perform great when limit is set to a large value, and sometime we use u64::MAX
+        let first_block_id = match key_range.start_bound() {
-        // as a marker for no limit, so we'd better capture that.
+            Bound::Included(key) | Bound::Excluded(key) => {
-        // (not great means we decode up to the whole bottom layer index, which can take dozens of
+                let Some(first_block_id) = self.sstable_index.locate_with_key(key) else {
-        // ms on a 100m term dictionary)
+                    return FileSlice::empty();
-        let limit = limit.filter(|limit| *limit != u64::MAX);
+                };
-
+                Some(first_block_id)
-        // TODO replace unwraps with proper error handling
+            }
        let start_key = match key_range.start_bound() {
            Bound::Included(key) | Bound::Excluded(key) => key,
            Bound::Unbounded => &[],
        };
        let end_key = match key_range.end_bound() {
            Bound::Included(key) | Bound::Excluded(key) => Some(key),
            Bound::Unbounded => None,
        };
        let bounds = if let Some(limit) = limit {
            let mut sstable_iterator = self.sstable_index.iterate_from_key(start_key).unwrap();
            let Some(start_block) = sstable_iterator.value() else {
                // range_start is after end of table
                return FileSlice::empty();
            };
            if let Some(end_key) = end_key {
                if sstable_iterator.key().unwrap() >= end_key {
                    // the start and end keys are in the same block, return just that block
                    return self.sstable_slice.slice(start_block.byte_range.clone());
                }
            }
            let start_bound = start_block.byte_range.start;
-            sstable_iterator.advance().unwrap();
+        let last_block_id = match key_range.end_bound() {
-            let Some(second_block) = sstable_iterator.value() else {
+            Bound::Included(key) | Bound::Excluded(key) => self.sstable_index.locate_with_key(key),
-                // we reached the end of the sstable, return everything from start_bound
+            Bound::Unbounded => None,
                return self.sstable_slice.slice(start_bound..);
            };
            let mut end_bound = second_block.byte_range.end;
            if let Some(end_key) = end_key {
                if sstable_iterator.key().unwrap() >= end_key {
                    return self.sstable_slice.slice(start_bound..end_bound);
                }
            }
            let target_ord = second_block.first_ordinal + limit;
            while sstable_iterator.advance().unwrap() {
                let block = sstable_iterator.value().unwrap();
                if block.first_ordinal >= target_ord {
                    break;
                }
                end_bound = block.byte_range.end;
                if let Some(end_key) = end_key {
                    if sstable_iterator.key().unwrap() >= end_key {
                        break;
                    }
                }
            }
            let start_bound = Bound::Included(start_bound);
            let end_bound = Bound::Excluded(end_bound);
            (start_bound, end_bound)
        } else {
            let Some(start_block) = self.sstable_index.get_block_with_key(start_key).unwrap()
            else {
                // range_start is after end of table
                return FileSlice::empty();
            };
            let start_bound = Bound::Included(start_block.byte_range.start);
            let end_bound = if let Some(end_key) = end_key {
                if let Some(end_block) = self.sstable_index.get_block_with_key(end_key).unwrap() {
                    Bound::Excluded(end_block.byte_range.end)
                } else {
                    Bound::Unbounded
                }
            } else {
                Bound::Unbounded
            };
            (start_bound, end_bound)
        };
-        self.sstable_slice.slice(bounds)
+        let start_bound = if let Some(first_block_id) = first_block_id {
            let Some(block_addr) = self.sstable_index.get_block(first_block_id) else {
                return FileSlice::empty();
            };
            Bound::Included(block_addr.byte_range.start)
        } else {
            Bound::Unbounded
        };
        let last_block_id = if let Some(limit) = limit {
            let second_block_id = first_block_id.map(|id| id + 1).unwrap_or(0);
            if let Some(block_addr) = self.sstable_index.get_block(second_block_id) {
                let ordinal_limit = block_addr.first_ordinal + limit;
                let last_block_limit = self.sstable_index.locate_with_ord(ordinal_limit);
                if let Some(last_block_id) = last_block_id {
                    Some(last_block_id.min(last_block_limit))
                } else {
                    Some(last_block_limit)
                }
            } else {
                last_block_id
            }
        } else {
            last_block_id
        };
        let end_bound = last_block_id
            .and_then(|block_id| self.sstable_index.get_block(block_id))
            .map(|block_addr| Bound::Excluded(block_addr.byte_range.end))
            .unwrap_or(Bound::Unbounded);
        self.sstable_slice.slice((start_bound, end_bound))
    }
    /// Opens a `TermDictionary`.
    pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
        let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(20);
        let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
        // let layer_count = u32::deserialize(&mut footer_len_bytes)?;
        let index_offset = u64::deserialize(&mut footer_len_bytes)?;
        let num_terms = u64::deserialize(&mut footer_len_bytes)?;
        let version = u32::deserialize(&mut footer_len_bytes)?;
        let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
-        match version {
+        let sstable_index_bytes = index_slice.read_bytes()?;
            2 => {
                // previous format, kept for backward compatibility
                let sstable_index_bytes = index_slice.read_bytes()?;
                // on the old format, the 1st layer necessarily start immediately, and there is
                // only a single layer
                let sstable_index =
                    SSTableIndex::load(sstable_index_bytes, 1, 0).map_err(|_| {
                        io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
                    })?;
                Ok(Dictionary {
                    sstable_slice,
                    sstable_index,
                    num_terms,
                    phantom_data: PhantomData,
                })
            }
            3 => {
                let sstable_index_bytes = index_slice.read_bytes()?;
                let (sstable_index_bytes, mut v3_footer_bytes) = sstable_index_bytes.rsplit(12);
                let first_layer_offset = v3_footer_bytes.read_u64();
                let layer_count = v3_footer_bytes.read_u32();
-                let sstable_index = SSTableIndex::load(
+        let sstable_index = match version {
-                    sstable_index_bytes,
+            2 => SSTableIndex::V2(
-                    layer_count,
+                crate::sstable_index_v2::SSTableIndex::load(sstable_index_bytes).map_err(|_| {
-                    first_layer_offset as usize,
+                    io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
-                )
+                })?,
-                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption"))?;
+            ),
-                Ok(Dictionary {
+            3 => {
-                    sstable_slice,
+                let (sstable_index_bytes, mut footerv3_len_bytes) = sstable_index_bytes.rsplit(8);
-                    sstable_index,
+                let store_offset = u64::deserialize(&mut footerv3_len_bytes)?;
-                    num_terms,
+                if store_offset != 0 {
-                    phantom_data: PhantomData,
+                    SSTableIndex::V3(
-                })
+                        SSTableIndexV3::load(sstable_index_bytes, store_offset).map_err(|_| {
                            io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
                        })?,
                    )
                } else {
                    // if store_offset is zero, there is no index, so we build a pseudo-index
                    // assuming a single block of sstable covering everything.
                    SSTableIndex::V3Empty(SSTableIndexV3Empty::load(index_offset as usize))
                }
            }
            _ => {
                return Err(io::Error::new(
                    io::ErrorKind::Other,
-                    format!(
+                    format!("Unsuported sstable version, expected one of [2, 3], found {version}"),
                        "Unsuported sstable version, expected {}, found {version}",
                        crate::SSTABLE_VERSION,
                    ),
                ))
            }
-        }
+        };
        Ok(Dictionary {
            sstable_slice,
            sstable_index,
            num_terms,
            phantom_data: PhantomData,
        })
    }
    /// Creates a term dictionary from the supplied bytes.
@@ -289,17 +247,68 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
        self.num_terms as usize
    }
    /// Decode a DeltaReader up to key, returning the number of terms traversed
    ///
    /// If the key was not found, returns Ok(None).
    /// After calling this function, it is possible to call `DeltaReader::value` to get the
    /// associated value.
    ///
    /// The entries are never rebuilt in full: the scan only tracks `ok_bytes`, the number of
    /// leading bytes of `key` already known to match, and judges each entry from its
    /// `common_prefix_len()` and `suffix()` alone.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `DeltaReader::advance`.
    fn decode_up_to_key<K: AsRef<[u8]>>(
        &self,
        key: K,
        sstable_delta_reader: &mut DeltaReader<TSSTable::ValueReader>,
    ) -> io::Result<Option<TermOrdinal>> {
        // Number of entries skipped so far; returned as the ordinal of the match.
        let mut term_ord = 0;
        let key_bytes = key.as_ref();
        // Count of leading bytes of `key_bytes` matched by the entries seen so far.
        let mut ok_bytes = 0;
        while sstable_delta_reader.advance()? {
            // NOTE(review): presumably the number of bytes this entry shares with the
            // previous entry, and the bytes that follow that shared prefix -- confirm
            // against `DeltaReader`.
            let prefix_len = sstable_delta_reader.common_prefix_len();
            let suffix = sstable_delta_reader.suffix();
            match prefix_len.cmp(&ok_bytes) {
                Ordering::Less => return Ok(None), // popped bytes already matched => too far
                Ordering::Equal => (),
                Ordering::Greater => {
                    // the ok prefix is less than current entry prefix => continue to next elem
                    term_ord += 1;
                    continue;
                }
            }
            // we have ok_bytes byte of common prefix, check if this key adds more
            // The zip stops at whichever of the remaining key bytes or the suffix runs out first.
            for (key_byte, suffix_byte) in key_bytes[ok_bytes..].iter().zip(suffix) {
                match suffix_byte.cmp(key_byte) {
                    Ordering::Less => break,              // byte too small
                    Ordering::Equal => ok_bytes += 1,     // new matching byte
                    Ordering::Greater => return Ok(None), // too far
                }
            }
            // Every byte of the key has been matched: the entry is either exactly the key or
            // strictly longer than it.
            if ok_bytes == key_bytes.len() {
                if prefix_len + suffix.len() == ok_bytes {
                    return Ok(Some(term_ord));
                } else {
                    // current key is a prefix of current element, not a match
                    return Ok(None);
                }
            }
            term_ord += 1;
        }
        // The reader ran out of entries without reaching the key.
        Ok(None)
    }
    /// Returns the ordinal associated with a given term.
    pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
        let key_bytes = key.as_ref();
-        let Some(block_addr) = self.sstable_index.get_block_with_key(key_bytes)? else {
+        let Some(block_addr) = self.sstable_index.get_block_with_key(key_bytes) else {
            return Ok(None);
        };
        let first_ordinal = block_addr.first_ordinal;
        let mut sstable_delta_reader = self.sstable_delta_reader_block(block_addr)?;
-        decode_up_to_key(key_bytes, &mut sstable_delta_reader)
+        self.decode_up_to_key(key_bytes, &mut sstable_delta_reader)
            .map(|opt| opt.map(|ord| ord + first_ordinal))
    }
@@ -314,7 +323,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
    /// the buffer may be modified.
    pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
        // find block in which the term would be
-        let block_addr = self.sstable_index.get_block_with_ord(ord)?;
+        let block_addr = self.sstable_index.get_block_with_ord(ord);
        let first_ordinal = block_addr.first_ordinal;
        // then search inside that block only
@@ -332,7 +341,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
    /// Returns the value associated with the term of ordinal `term_ord`, if there is one.
    pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<Option<TSSTable::Value>> {
        // find block in which the term would be
-        let block_addr = self.sstable_index.get_block_with_ord(term_ord)?;
+        let block_addr = self.sstable_index.get_block_with_ord(term_ord);
        let first_ordinal = block_addr.first_ordinal;
        // then search inside that block only
@@ -347,7 +356,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
    /// Lookups the value corresponding to the key.
    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TSSTable::Value>> {
-        if let Some(block_addr) = self.sstable_index.get_block_with_key(key.as_ref())? {
+        if let Some(block_addr) = self.sstable_index.get_block_with_key(key.as_ref()) {
            let sstable_reader = self.sstable_delta_reader_block(block_addr)?;
            return self.do_get(key, sstable_reader);
        }
@@ -356,7 +365,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
    /// Lookups the value corresponding to the key.
    pub async fn get_async<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TSSTable::Value>> {
-        if let Some(block_addr) = self.sstable_index.get_block_with_key(key.as_ref())? {
+        if let Some(block_addr) = self.sstable_index.get_block_with_key(key.as_ref()) {
            let sstable_reader = self.sstable_delta_reader_block_async(block_addr).await?;
            return self.do_get(key, sstable_reader);
        }
@@ -368,7 +377,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
        key: K,
        mut reader: DeltaReader<TSSTable::ValueReader>,
    ) -> io::Result<Option<TSSTable::Value>> {
-        if let Some(_ord) = decode_up_to_key(key, &mut reader)? {
+        if let Some(_ord) = self.decode_up_to_key(key, &mut reader)? {
            Ok(Some(reader.value().clone()))
        } else {
            Ok(None)
@@ -405,56 +414,6 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
    }
 }
 /// Decode a DeltaReader up to key, returning the number of terms traversed
 ///
 /// If the key was not found, returns Ok(None).
 /// After calling this function, it is possible to call `DeltaReader::value` to get the
 /// associated value.
 ///
 /// The entries are never rebuilt in full: the scan only tracks `ok_bytes`, the number of
 /// leading bytes of `key` already known to match, and judges each entry from its
 /// `common_prefix_len()` and `suffix()` alone.
 ///
 /// # Errors
 ///
 /// Propagates any error returned by `DeltaReader::advance`.
 pub(crate) fn decode_up_to_key<K: AsRef<[u8]>, TValueReader: crate::ValueReader>(
    key: K,
    sstable_delta_reader: &mut DeltaReader<TValueReader>,
 ) -> io::Result<Option<TermOrdinal>> {
    // Number of entries skipped so far; returned as the ordinal of the match.
    let mut term_ord = 0;
    let key_bytes = key.as_ref();
    // Count of leading bytes of `key_bytes` matched by the entries seen so far.
    let mut ok_bytes = 0;
    while sstable_delta_reader.advance()? {
        // NOTE(review): presumably the number of bytes this entry shares with the previous
        // entry, and the bytes that follow that shared prefix -- confirm against `DeltaReader`.
        let prefix_len = sstable_delta_reader.common_prefix_len();
        let suffix = sstable_delta_reader.suffix();
        match prefix_len.cmp(&ok_bytes) {
            Ordering::Less => return Ok(None), // popped bytes already matched => too far
            Ordering::Equal => (),
            Ordering::Greater => {
                // the ok prefix is less than current entry prefix => continue to next elem
                term_ord += 1;
                continue;
            }
        }
        // we have ok_bytes byte of common prefix, check if this key adds more
        // The zip stops at whichever of the remaining key bytes or the suffix runs out first.
        for (key_byte, suffix_byte) in key_bytes[ok_bytes..].iter().zip(suffix) {
            match suffix_byte.cmp(key_byte) {
                Ordering::Less => break,              // byte too small
                Ordering::Equal => ok_bytes += 1,     // new matching byte
                Ordering::Greater => return Ok(None), // too far
            }
        }
        // Every byte of the key has been matched: the entry is either exactly the key or
        // strictly longer than it.
        if ok_bytes == key_bytes.len() {
            if prefix_len + suffix.len() == ok_bytes {
                return Ok(Some(term_ord));
            } else {
                // current key is a prefix of current element, not a match
                return Ok(None);
            }
        }
        term_ord += 1;
    }
    // The reader ran out of entries without reaching the key.
    Ok(None)
 }
 #[cfg(test)]
 mod tests {
    use std::ops::Range;
@@ -520,6 +479,8 @@ mod tests {
        let dictionary = Dictionary::<MonotonicU64SSTable>::open(slice).unwrap();
        // if the last block is id 0, tests are meaningless
        assert_ne!(dictionary.sstable_index.locate_with_ord(u64::MAX), 0);
        assert_eq!(dictionary.num_terms(), 0x3ffff);
        (dictionary, table)
    }
@@ -528,7 +489,7 @@ mod tests {
    fn test_ord_term_conversion() {
        let (dic, slice) = make_test_sstable();
-        let block = dic.sstable_index.get_block_with_ord(100_000).unwrap();
+        let block = dic.sstable_index.get_block_with_ord(100_000);
        slice.restrict(block.byte_range);
        let mut res = Vec::new();
@@ -554,11 +515,7 @@ mod tests {
        // end of a block
        let ordinal = block.first_ordinal - 1;
-        let new_range = dic
+        let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
            .sstable_index
            .get_block_with_ord(ordinal)
            .unwrap()
            .byte_range;
        slice.restrict(new_range);
        assert!(dic.ord_to_term(ordinal, &mut res).unwrap());
        assert_eq!(res, format!("{ordinal:05X}").into_bytes());
@@ -568,7 +525,7 @@ mod tests {
        // before first block
        // 1st block must be loaded for key-related operations
-        let block = dic.sstable_index.get_block_with_ord(0).unwrap();
+        let block = dic.sstable_index.get_block_with_ord(0);
        slice.restrict(block.byte_range);
        assert!(dic.get(b"$$$").unwrap().is_none());
@@ -577,11 +534,7 @@ mod tests {
        // after last block
        // last block must be loaded for ord related operations
        let ordinal = 0x40000 + 10;
-        let new_range = dic
+        let new_range = dic.sstable_index.get_block_with_ord(ordinal).byte_range;
            .sstable_index
            .get_block_with_ord(ordinal)
            .unwrap()
            .byte_range;
        slice.restrict(new_range);
        assert!(!dic.ord_to_term(ordinal, &mut res).unwrap());
        assert!(dic.term_info_from_ord(ordinal).unwrap().is_none());
@@ -606,13 +559,11 @@ mod tests {
            .sstable_index
            .get_block_with_key(b"10000")
            .unwrap()
            .unwrap()
            .byte_range;
        let end = dic
            .sstable_index
            .get_block_with_key(b"18000")
            .unwrap()
            .unwrap()
            .byte_range;
        slice.restrict(start.start..end.end);
--- a/sstable/src/lib.rs
+++ b/sstable/src/lib.rs
@@ -1,6 +1,6 @@
 use std::io::{self, Write};
 use std::num::NonZeroU64;
 use std::ops::Range;
 use std::usize;
 use merge::ValueMerger;
@@ -10,8 +10,9 @@ pub mod merge;
 mod streamer;
 pub mod value;
-mod sstable_index;
+mod sstable_index_v3;
-pub use sstable_index::{BlockAddr, SSTableIndex, SSTableIndexBuilder};
+pub use sstable_index_v3::{BlockAddr, SSTableIndex, SSTableIndexBuilder, SSTableIndexV3};
 mod sstable_index_v2;
 pub(crate) mod vint;
 pub use dictionary::Dictionary;
 pub use streamer::{Streamer, StreamerBuilder};
@@ -30,12 +31,6 @@ pub type TermOrdinal = u64;
 const DEFAULT_KEY_CAPACITY: usize = 50;
 const SSTABLE_VERSION: u32 = 3;
 // TODO tune that value. Maybe it's too little?
 #[cfg(not(test))]
 const DEFAULT_MAX_ROOT_BLOCKS: NonZeroU64 = unsafe { NonZeroU64::new_unchecked(32) };
 #[cfg(test)]
 const DEFAULT_MAX_ROOT_BLOCKS: NonZeroU64 = unsafe { NonZeroU64::new_unchecked(1) };
 /// Given two byte string returns the length of
 /// the longest common prefix.
 fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
@@ -61,7 +56,7 @@ pub trait SSTable: Sized {
    }
    fn writer<W: io::Write>(wrt: W) -> Writer<W, Self::ValueWriter> {
-        Writer::new(wrt, DEFAULT_MAX_ROOT_BLOCKS)
+        Writer::new(wrt)
    }
    fn delta_reader(reader: OwnedBytes) -> DeltaReader<Self::ValueReader> {
@@ -184,7 +179,6 @@ where W: io::Write
    delta_writer: DeltaWriter<W, TValueWriter>,
    num_terms: u64,
    first_ordinal_of_the_block: u64,
    index_max_root_blocks: NonZeroU64,
 }
 impl<W, TValueWriter> Writer<W, TValueWriter>
@@ -197,18 +191,17 @@ where
    /// TODO remove this function. (See Issue #1727)
    #[doc(hidden)]
    pub fn create(wrt: W) -> io::Result<Self> {
-        Ok(Self::new(wrt, DEFAULT_MAX_ROOT_BLOCKS))
+        Ok(Self::new(wrt))
    }
    /// Creates a new `TermDictionaryBuilder`.
-    pub fn new(wrt: W, index_max_root_blocks: NonZeroU64) -> Self {
+    pub fn new(wrt: W) -> Self {
        Writer {
            previous_key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
            num_terms: 0u64,
            index_builder: SSTableIndexBuilder::default(),
            delta_writer: DeltaWriter::new(wrt),
            first_ordinal_of_the_block: 0u64,
            index_max_root_blocks,
        }
    }
@@ -310,15 +303,11 @@ where
        // add a final empty block as an end marker
        wrt.write_all(&0u32.to_le_bytes())?;
-        let index_offset = wrt.written_bytes();
+        let offset = wrt.written_bytes();
-        let (layer_count, layer_offset): (u32, u64) = self
+        let fst_len: u64 = self.index_builder.serialize(&mut wrt)?;
-            .index_builder
+        wrt.write_all(&fst_len.to_le_bytes())?;
-            .serialize(&mut wrt, self.index_max_root_blocks)?;
+        wrt.write_all(&offset.to_le_bytes())?;
        wrt.write_all(&layer_offset.to_le_bytes())?;
        wrt.write_all(&layer_count.to_le_bytes())?;
        wrt.write_all(&index_offset.to_le_bytes())?;
        wrt.write_all(&self.num_terms.to_le_bytes())?;
        SSTABLE_VERSION.serialize(&mut wrt)?;
@@ -398,12 +387,7 @@ mod test {
                16, 17, 33, 18, 19, 17, 20, // data block
                0, 0, 0, 0, // no more block
                // index
-                8, 0, 0, 0, // size of index block
+                0, 0, 0, 0, 0, 0, 0, 0, // fst length
                0, // compression
                1, 0, 12, 0, 32, 17, 20, // index block
                0, 0, 0, 0, // no more index block
                0, 0, 0, 0, 0, 0, 0, 0, // first layer offset
                1, 0, 0, 0, // layer count
                16, 0, 0, 0, 0, 0, 0, 0, // index start offset
                3, 0, 0, 0, 0, 0, 0, 0, // num term
                3, 0, 0, 0, // version
--- a/sstable/src/sstable_index.rs
+++ b/sstable/src/sstable_index.rs
@@ -1,487 +0,0 @@
 use std::io::{self, Write};
 use std::ops::Range;
 use common::OwnedBytes;
 use crate::{common_prefix_len, SSTable, SSTableDataCorruption, TermOrdinal};
 /// Index over the blocks of an sstable, organised in `layer_count` layers.
 ///
 /// The root layer is held decoded in `root_blocks`; any further layers stay serialized in
 /// `index_bytes` and are decoded on demand by the lookup methods.
 #[derive(Debug, Clone)]
 pub struct SSTableIndex {
    /// Entries of the root layer, in the order they were read by `load`.
    root_blocks: Vec<BlockMeta>,
    /// Number of layers; with a value of 1 the root entries are the final block addresses.
    layer_count: u32,
    /// Serialized bytes that the root entries' `byte_range`s are sliced from when
    /// `layer_count > 1`.
    index_bytes: OwnedBytes,
 }
 impl Default for SSTableIndex {
    /// Builds an index with no root block, a single layer and no serialized bytes.
    fn default() -> Self {
        let root_blocks = Vec::new();
        let index_bytes = OwnedBytes::empty();
        Self {
            root_blocks,
            layer_count: 1,
            index_bytes,
        }
    }
 }
 impl SSTableIndex {
    /// Load an index from its binary representation
    ///
    /// `data` is split at `first_layer_offset`. The second part is decoded eagerly into the
    /// root blocks; the first part is kept as-is in `index_bytes` for later lookups.
    ///
    /// # Errors
    ///
    /// Any failure while advancing the reader is reported as [`SSTableDataCorruption`].
    pub fn load(
        data: OwnedBytes,
        layer_count: u32,
        first_layer_offset: usize,
    ) -> Result<SSTableIndex, SSTableDataCorruption> {
        let (index_bytes, first_layer_slice) = data.split(first_layer_offset);
        let mut reader = IndexSSTable::reader(first_layer_slice);
        let mut root_blocks = Vec::new();
        // Copy every (key, value) pair of the root layer into memory.
        while reader.advance().map_err(|_| SSTableDataCorruption)? {
            root_blocks.push(BlockMeta {
                last_key_or_greater: reader.key().to_vec(),
                block_addr: reader.value().clone(),
            });
        }
        Ok(SSTableIndex {
            root_blocks,
            layer_count,
            index_bytes,
            // index_bytes: OwnedBytes::empty(),
        })
    }
    /// Get the [`BlockAddr`] of the block that would contain `key`.
    ///
    /// Returns None if `key` is lexicographically after the last key recorded.
    ///
    /// # Errors
    ///
    /// Propagates the error of `iterate_from_key`.
    pub fn get_block_with_key(&self, key: &[u8]) -> io::Result<Option<BlockAddr>> {
        self.iterate_from_key(key).map(|iter| iter.value().cloned())
    }
    /// Get the [`BlockAddr`] of the block containing the `ord`-th term.
    ///
    /// The root layer is binary-searched on `first_ordinal`; each further layer is then
    /// scanned linearly, keeping the last entry whose `first_ordinal` does not exceed `ord`.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while advancing a delta reader over `index_bytes`.
    ///
    /// # Panics
    ///
    /// Relies on the first root block starting at ordinal zero: if no root block has a
    /// `first_ordinal` lower or equal to `ord` (for instance when there is no root block at
    /// all), `pos - 1` underflows and the following indexing is out of bounds.
    pub fn get_block_with_ord(&self, ord: TermOrdinal) -> io::Result<BlockAddr> {
        let pos = self
            .root_blocks
            .binary_search_by_key(&ord, |block| block.block_addr.first_ordinal);
        let root_pos = match pos {
            Ok(pos) => pos,
            // Err(0) can't happen as the sstable starts with ordinal zero
            Err(pos) => pos - 1,
        };
        // Single layer: the root entry already is the block address we are looking for.
        if self.layer_count == 1 {
            return Ok(self.root_blocks[root_pos].block_addr.clone());
        }
        let mut next_layer_block_addr = self.root_blocks[root_pos].block_addr.clone();
        // One iteration per layer below the root.
        for _ in 1..self.layer_count {
            let mut sstable_delta_reader = IndexSSTable::delta_reader(
                self.index_bytes
                    .slice(next_layer_block_addr.byte_range.clone()),
            );
            while sstable_delta_reader.advance()? {
                // Stop at the first entry starting after `ord`; the previous one is kept.
                if sstable_delta_reader.value().first_ordinal > ord {
                    break;
                }
                next_layer_block_addr = sstable_delta_reader.value().clone();
            }
        }
        Ok(next_layer_block_addr)
    }
    /// Returns a cursor positioned on the index entry of the block that would contain `key`.
    ///
    /// The result is `ReaderOrSlice::End` when `key` sorts after every root key,
    /// `ReaderOrSlice::Iter` over the root blocks when the index has a single layer, and
    /// `ReaderOrSlice::Reader` over the last decoded layer otherwise.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `decode_up_to_key` while descending the layers.
    pub(crate) fn iterate_from_key(&self, key: &[u8]) -> io::Result<ReaderOrSlice<'_>> {
        let root_pos = self
            .root_blocks
            .binary_search_by_key(&key, |block| &block.last_key_or_greater);
        let root_pos = match root_pos {
            Ok(pos) => pos,
            Err(pos) => {
                if pos < self.root_blocks.len() {
                    pos
                } else {
                    // after end of last block: no block matches
                    return Ok(ReaderOrSlice::End);
                }
            }
        };
        let mut next_layer_block_addr = self.root_blocks[root_pos].block_addr.clone();
        // Delta reader of the deepest layer decoded so far, if any.
        let mut last_delta_reader = None;
        for _ in 1..self.layer_count {
            // we don't enter this loop for 1 layer index
            let mut sstable_delta_reader = IndexSSTable::delta_reader(
                self.index_bytes.slice(next_layer_block_addr.byte_range),
            );
            // Only the reader position matters here; the returned ordinal is discarded.
            crate::dictionary::decode_up_to_key(key, &mut sstable_delta_reader)?;
            next_layer_block_addr = sstable_delta_reader.value().clone();
            last_delta_reader = Some(sstable_delta_reader);
        }
        if let Some(delta_reader) = last_delta_reader {
            // reconstruct the current key. We stopped either on the exact key, or just after
            // either way, common_prefix_len is something that did not change between the
            // last-key-before-target and the current pos, so those bytes must match the prefix of
            // `key`. The next bytes can be obtained from the delta reader
            let mut result_key = Vec::with_capacity(crate::DEFAULT_KEY_CAPACITY);
            let common_prefix_len = delta_reader.common_prefix_len();
            let suffix = delta_reader.suffix();
            let new_len = delta_reader.common_prefix_len() + suffix.len();
            result_key.resize(new_len, 0u8);
            result_key[..common_prefix_len].copy_from_slice(&key[..common_prefix_len]);
            result_key[common_prefix_len..].copy_from_slice(suffix);
            let reader = crate::Reader {
                key: result_key,
                delta_reader,
            };
            Ok(ReaderOrSlice::Reader(reader))
        } else {
            // self.layer_count == 1, there is no lvl2 sstable to decode.
            Ok(ReaderOrSlice::Iter(&self.root_blocks, root_pos))
        }
    }
 }
 /// Cursor over index entries, as produced by `SSTableIndex::iterate_from_key`.
 pub(crate) enum ReaderOrSlice<'a> {
    /// Entries are decoded through a reader over serialized index data.
    Reader(crate::Reader<crate::value::index::IndexValueReader>),
    /// Entries come from a slice of in-memory blocks; the `usize` is the current position.
    Iter(&'a [BlockMeta], usize),
    /// There is no current entry: `key()` and `value()` return `None`.
    End,
 }
 impl<'a> ReaderOrSlice<'a> {
    /// Moves the cursor to the next entry.
    ///
    /// Yields `Ok(true)` when a new entry is available. On any other outcome, either
    /// exhaustion or a read failure mapped to `SSTableDataCorruption`, the cursor turns into
    /// `ReaderOrSlice::End` before that outcome is returned.
    pub fn advance(&mut self) -> Result<bool, SSTableDataCorruption> {
        let outcome = match self {
            ReaderOrSlice::End => return Ok(false),
            ReaderOrSlice::Reader(inner) => inner.advance().map_err(|_| SSTableDataCorruption),
            ReaderOrSlice::Iter(blocks, cursor) => {
                *cursor += 1;
                Ok(*cursor < blocks.len())
            }
        };
        if !matches!(outcome, Ok(true)) {
            *self = ReaderOrSlice::End;
        }
        outcome
    }
    /// Key of the entry under the cursor.
    ///
    /// This is `Some(_)` in every state but `ReaderOrSlice::End`, i.e. unless the last call to
    /// `advance` returned something else than `Ok(true)`.
    pub fn key(&self) -> Option<&[u8]> {
        match self {
            ReaderOrSlice::End => None,
            ReaderOrSlice::Iter(blocks, cursor) => {
                let current = &blocks[*cursor];
                Some(&current.last_key_or_greater)
            }
            ReaderOrSlice::Reader(inner) => Some(inner.key()),
        }
    }
    /// Block address of the entry under the cursor.
    ///
    /// This is `Some(_)` in every state but `ReaderOrSlice::End`, i.e. unless the last call to
    /// `advance` returned something else than `Ok(true)`.
    pub fn value(&self) -> Option<&BlockAddr> {
        match self {
            ReaderOrSlice::End => None,
            ReaderOrSlice::Iter(blocks, cursor) => {
                let current = &blocks[*cursor];
                Some(&current.block_addr)
            }
            ReaderOrSlice::Reader(inner) => Some(inner.value()),
        }
    }
 }
 /// Location of a block together with the ordinal its content starts at.
 #[derive(Clone, Eq, PartialEq, Debug)]
 pub struct BlockAddr {
    /// Byte range occupied by the block (relative to the data the index points into).
    pub byte_range: Range<usize>,
    /// Ordinal of the first term of the block; lookups by ordinal search on this field.
    pub first_ordinal: u64,
 }
 /// Index entry: a separator key paired with the address of the block it bounds.
 #[derive(Debug, Clone)]
 pub(crate) struct BlockMeta {
    /// Any byte string that is lexicographically greater or equal to
    /// the last key in the block,
    /// and yet strictly smaller than the first key in the next block.
    pub last_key_or_greater: Vec<u8>,
    /// Address of the block this entry describes.
    pub block_addr: BlockAddr,
 }
 /// Accumulates block entries through `add_block` and writes them out with `serialize`.
 #[derive(Default)]
 pub struct SSTableIndexBuilder {
    /// Index under construction; only its `root_blocks` are filled by the builder.
    index: SSTableIndex,
 }
 /// Shortens `left` in place while keeping it between its old value and `right`.
 ///
 /// Requires `left < right` (asserted). Afterwards `left` holds a byte string `left'` with
 /// `old left <= left' < right` that is never longer than the old value.
 fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
    assert!(&left[..] < right);
    let shared_len = common_prefix_len(left, right);
    // `left` is a prefix of `right`: nothing can be trimmed.
    if shared_len == left.len() {
        return;
    }
    // It is possible to do one character shorter in some case,
    // but it is not worth the extra complexity
    //
    // The byte at `shared_len` is left alone; past it, look for the first byte that can be
    // incremented without overflowing.
    let tail_start = shared_len + 1;
    let bump_offset = left[tail_start..]
        .iter()
        .position(|&byte| byte != u8::MAX);
    if let Some(offset) = bump_offset {
        let bump_pos = tail_start + offset;
        left[bump_pos] += 1;
        left.truncate(bump_pos + 1);
    }
 }
 impl SSTableIndexBuilder {
    /// Keeps the index light by swapping the key recorded for the most recently added block
    /// for a shorter byte string that is still smaller than `next_key`.
    ///
    /// Does nothing when no block has been added yet.
    pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) {
        let Some(newest_block) = self.index.root_blocks.last_mut() else {
            return;
        };
        find_shorter_str_in_between(&mut newest_block.last_key_or_greater, next_key);
    }
    /// Appends an entry for a block ending at `last_key`, located at `byte_range` and starting
    /// at ordinal `first_ordinal`.
    pub fn add_block(&mut self, last_key: &[u8], byte_range: Range<usize>, first_ordinal: u64) {
        let block_addr = BlockAddr {
            byte_range,
            first_ordinal,
        };
        let block_meta = BlockMeta {
            last_key_or_greater: last_key.to_vec(),
            block_addr,
        };
        self.index.root_blocks.push(block_meta)
    }
    /// Writes the index to `wrt`, one layer after the other.
    ///
    /// The recorded blocks form the first layer. While the layer just written produced more
    /// than `index_max_root_blocks` entries, those entries are written as one more layer.
    ///
    /// Returns the number of layers written and the offset, in bytes written through `wrt`
    /// by this call, at which the last layer starts.
    pub fn serialize<W: std::io::Write>(
        &self,
        wrt: W,
        index_max_root_blocks: std::num::NonZeroU64,
    ) -> io::Result<(u32, u64)> {
        let root_block_limit = index_max_root_blocks.get();
        let mut counting_wrt = common::CountingWriter::wrap(wrt);
        let mut layer_count = 1;
        let mut last_layer_offset = 0;
        let mut upper_layer = write_sstable_layer(&mut counting_wrt, &self.index.root_blocks, 0)?;
        loop {
            if upper_layer.len() as u64 <= root_block_limit {
                break;
            }
            last_layer_offset = counting_wrt.written_bytes();
            layer_count += 1;
            upper_layer =
                write_sstable_layer(&mut counting_wrt, &upper_layer, last_layer_offset as usize)?;
        }
        Ok((layer_count, last_layer_offset))
    }
 }
 /// Writes one index layer to `wrt` and returns the entries describing the blocks written.
 ///
 /// Each entry of `layer_content` is written as a key suffix (relative to the previous key of
 /// the same block) followed by its block address. Whenever the writer flushes a block, a
 /// [`BlockMeta`] is recorded for it, made of:
 /// - the key of the last entry written to that block,
 /// - the flushed byte range shifted by `offset`,
 /// - the `first_ordinal` of the first entry written to that block.
 ///
 /// A zero `u32` is written after the last block as an end marker.
 ///
 /// # Errors
 ///
 /// Propagates any I/O error raised while flushing blocks or writing the end marker.
 fn write_sstable_layer<W: std::io::Write>(
    wrt: W,
    layer_content: &[BlockMeta],
    offset: usize,
 ) -> io::Result<Vec<BlockMeta>> {
    // we can't use a plain writer as it would generate an index
    // also disable compression, the index is small anyway, and it's the most costly part of
    // opening that kind of sstable
    let mut sstable_writer =
        crate::DeltaWriter::<_, crate::value::index::IndexValueWriter>::new_no_compression(wrt);
    // in tests, set a smaller block size to stress-test
    #[cfg(test)]
    sstable_writer.set_block_len(16);
    let mut next_layer = Vec::new();
    // Last key written to the block being filled; empty right after a flush.
    let mut previous_key = Vec::with_capacity(crate::DEFAULT_KEY_CAPACITY);
    // First ordinal covered by the block being filled, `None` until an entry is written to it.
    let mut first_ordinal = None;
    for block in layer_content.iter() {
        let key = &block.last_key_or_greater;
        if first_ordinal.is_none() {
            first_ordinal = Some(block.block_addr.first_ordinal);
        }
        let keep_len = common_prefix_len(&previous_key, key);
        sstable_writer.write_suffix(keep_len, &key[keep_len..]);
        sstable_writer.write_value(&block.block_addr);
        let flushed_range = sstable_writer.flush_block_if_required()?;
        // In both cases the buffer restarts from scratch: it stays empty after a flush, and
        // otherwise becomes exactly the key that was just written.
        previous_key.clear();
        match flushed_range {
            Some(range) => {
                let real_range = (range.start + offset)..(range.end + offset);
                next_layer.push(BlockMeta {
                    last_key_or_greater: key.clone(),
                    block_addr: BlockAddr {
                        byte_range: real_range,
                        first_ordinal: first_ordinal.take().unwrap(),
                    },
                });
            }
            None => previous_key.extend_from_slice(key),
        }
    }
    if let Some(range) = sstable_writer.flush_block()? {
        if let Some(last_block) = layer_content.last() {
            // not going here means an empty table (?!)
            let real_range = (range.start + offset)..(range.end + offset);
            let block_meta = BlockMeta {
                last_key_or_greater: last_block.last_key_or_greater.clone(),
                block_addr: BlockAddr {
                    byte_range: real_range,
                    first_ordinal: first_ordinal.take().unwrap(),
                },
            };
            next_layer.push(block_meta);
        }
    }
    sstable_writer.finish().write_all(&0u32.to_le_bytes())?;
    Ok(next_layer)
 }
 /// SSTable representing an index
 ///
 /// `last_key_or_greater` is used as the key, the value contains the
 /// length and first ordinal of each block. The start offset is implicitly
 /// obtained from lengths.
 struct IndexSSTable;
 // Values are `BlockAddr`s, read and written with the dedicated index value codecs.
 impl SSTable for IndexSSTable {
    type Value = BlockAddr;
    type ValueReader = crate::value::index::IndexValueReader;
    type ValueWriter = crate::value::index::IndexValueWriter;
 }
 #[cfg(test)]
 mod tests {
    use common::OwnedBytes;
    use proptest::prelude::*;
    use super::{BlockAddr, SSTableIndex, SSTableIndexBuilder};
    use crate::SSTableDataCorruption;
    /// Serializes the four-block index shared by the index tests.
    fn serialized_sample_index() -> Vec<u8> {
        let mut builder = SSTableIndexBuilder::default();
        builder.add_block(b"aaa", 10..20, 0u64);
        builder.add_block(b"bbbbbbb", 20..30, 5u64);
        builder.add_block(b"ccc", 30..40, 10u64);
        builder.add_block(b"dddd", 40..50, 15u64);
        let mut serialized: Vec<u8> = Vec::new();
        builder
            .serialize(&mut serialized, crate::DEFAULT_MAX_ROOT_BLOCKS)
            .unwrap();
        serialized
    }
    /// Lookups by key and by ordinal land on the expected block.
    #[test]
    fn test_sstable_index() {
        let serialized = OwnedBytes::new(serialized_sample_index());
        let index = SSTableIndex::load(serialized, 1, 0).unwrap();
        let expected_block = BlockAddr {
            first_ordinal: 10u64,
            byte_range: 30..40,
        };
        assert_eq!(
            index.get_block_with_key(b"bbbde").unwrap(),
            Some(expected_block)
        );
        // (key, first ordinal of the block that would contain it)
        let key_cases: [(&[u8], u64); 4] = [
            (&b"aa"[..], 0),
            (&b"aaa"[..], 0),
            (&b"aab"[..], 5),
            (&b"ccc"[..], 10),
        ];
        for (key, first_ordinal) in key_cases.iter().copied() {
            let block = index.get_block_with_key(key).unwrap().unwrap();
            assert_eq!(block.first_ordinal, first_ordinal);
        }
        // a key after the last recorded key matches no block
        assert!(index.get_block_with_key(b"e").unwrap().is_none());
        // (ordinal, first ordinal of the block containing it)
        let ord_cases: [(u64, u64); 6] = [(0, 0), (1, 0), (4, 0), (5, 5), (6, 5), (100, 15)];
        for (ord, first_ordinal) in ord_cases.iter().copied() {
            let block = index.get_block_with_ord(ord).unwrap();
            assert_eq!(block.first_ordinal, first_ordinal);
        }
    }
    /// Overwriting a byte of the serialized index makes loading fail with a corruption error.
    #[test]
    fn test_sstable_with_corrupted_data() {
        let mut serialized = serialized_sample_index();
        serialized[2] = 9u8;
        let load_result = SSTableIndex::load(OwnedBytes::new(serialized), 1, 0);
        let load_error = load_result.err().unwrap();
        assert!(matches!(load_error, SSTableDataCorruption));
    }
    /// Checks that the shortened string is not longer than `left` and lies in `[left, right)`.
    #[track_caller]
    fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) {
        let mut shortened = left.to_vec();
        super::find_shorter_str_in_between(&mut shortened, right);
        assert!(shortened.len() <= left.len());
        assert!(left <= &shortened);
        assert!(&shortened[..] < right);
    }
    #[test]
    fn test_find_shorter_str_in_between() {
        test_find_shorter_str_in_between_aux(b"", b"hello");
        test_find_shorter_str_in_between_aux(b"abc", b"abcd");
        test_find_shorter_str_in_between_aux(b"abcd", b"abd");
        test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]);
        test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]);
        test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]);
    }
    proptest! {
        #![proptest_config(ProptestConfig::with_cases(100))]
        #[test]
        fn test_proptest_find_shorter_str(left in any::<Vec<u8>>(), right in any::<Vec<u8>>()) {
            if left < right {
                test_find_shorter_str_in_between_aux(&left, &right);
            }
        }
    }
 }
--- a/sstable/src/sstable_index_v2.rs
+++ b/sstable/src/sstable_index_v2.rs
@@ -0,0 +1,101 @@
 use common::OwnedBytes;
 use crate::{BlockAddr, SSTable, SSTableDataCorruption, TermOrdinal};
 /// Sstable index keeping the metadata of every block in a `Vec`.
 ///
 /// Blocks are located by binary search, either on `last_key_or_greater`
 /// or on the first ordinal of each block.
 #[derive(Default, Debug, Clone)]
 pub struct SSTableIndex {
    /// One entry per block, in the order they were read by `load`.
    blocks: Vec<BlockMeta>,
 }
 impl SSTableIndex {
    /// Builds the index by reading every entry of the serialized index sstable.
    ///
    /// Any error raised while advancing the reader is reported as
    /// [`SSTableDataCorruption`].
    pub fn load(data: OwnedBytes) -> Result<SSTableIndex, SSTableDataCorruption> {
        let mut reader = IndexSSTable::reader(data);
        let mut blocks = Vec::new();
        loop {
            let has_entry = reader.advance().map_err(|_| SSTableDataCorruption)?;
            if !has_entry {
                break;
            }
            let last_key_or_greater = reader.key().to_vec();
            let block_addr = reader.value().clone();
            blocks.push(BlockMeta {
                last_key_or_greater,
                block_addr,
            });
        }
        Ok(SSTableIndex { blocks })
    }
    /// Returns the [`BlockAddr`] of block `block_id`, or `None` if there is no such block.
    pub(crate) fn get_block(&self, block_id: usize) -> Option<BlockAddr> {
        let block_meta = self.blocks.get(block_id)?;
        Some(block_meta.block_addr.clone())
    }
    /// Returns the id of the block that would contain `key`.
    ///
    /// Returns `None` if `key` is lexicographically after the last key recorded.
    pub(crate) fn locate_with_key(&self, key: &[u8]) -> Option<usize> {
        // An exact hit and an insertion point both designate the first block whose
        // `last_key_or_greater` is not smaller than `key`.
        let candidate = match self
            .blocks
            .binary_search_by_key(&key, |block| &block.last_key_or_greater)
        {
            Ok(found) => found,
            Err(insert_at) => insert_at,
        };
        if candidate < self.blocks.len() {
            Some(candidate)
        } else {
            // `key` is past the end of the last block: no block matches.
            None
        }
    }
    /// Returns the [`BlockAddr`] of the block that would contain `key`.
    ///
    /// Returns `None` if `key` is lexicographically after the last key recorded.
    pub fn get_block_with_key(&self, key: &[u8]) -> Option<BlockAddr> {
        let block_id = self.locate_with_key(key)?;
        self.get_block(block_id)
    }
    /// Returns the id of the block containing the `ord`-th term.
    pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> usize {
        self.blocks
            .binary_search_by_key(&ord, |block| block.block_addr.first_ordinal)
            // An insertion point of 0 can't happen as the sstable starts with ordinal
            // zero, so the block just before the insertion point holds `ord`.
            .unwrap_or_else(|insert_at| insert_at - 1)
    }
    /// Returns the [`BlockAddr`] of the block containing the `ord`-th term.
    pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
        let block_id = self.locate_with_ord(ord);
        // `locate_with_ord` always returns an index within range.
        self.get_block(block_id).unwrap()
    }
 }
 /// Index entry for one block: an upper bound of its keys and its address.
 #[derive(Debug, Clone)]
 pub(crate) struct BlockMeta {
    /// Any byte string that is lexicographically greater or equal to
    /// the last key in the block,
    /// and yet strictly smaller than the first key in the next block.
    pub last_key_or_greater: Vec<u8>,
    /// Address of the block this entry describes.
    pub block_addr: BlockAddr,
 }
 /// SSTable representing an index
 ///
 /// `last_key_or_greater` is used as the key, the value contains the
 /// length and first ordinal of each block. The start offset is implicitly
 /// obtained from lengths.
 struct IndexSSTable;
 // Wires the index sstable to the `BlockAddr` value type and its dedicated
 // reader/writer from `crate::value::index`.
 impl SSTable for IndexSSTable {
    type Value = BlockAddr;
    type ValueReader = crate::value::index::IndexValueReader;
    type ValueWriter = crate::value::index::IndexValueWriter;
 }
--- a/sstable/src/sstable_index_v3.rs
+++ b/sstable/src/sstable_index_v3.rs
@@ -0,0 +1,826 @@
 use std::io::{self, Read, Write};
 use std::ops::Range;
 use std::sync::Arc;
 use common::{BinarySerializable, FixedSize, OwnedBytes};
 use tantivy_bitpacker::{compute_num_bits, BitPacker};
 use tantivy_fst::raw::Fst;
 use tantivy_fst::{IntoStreamer, Map, MapBuilder, Streamer};
 use crate::{common_prefix_len, SSTableDataCorruption, TermOrdinal};
 /// Index of an sstable, in one of the supported representations.
 ///
 /// Every method of this enum dispatches to the wrapped implementation.
 #[derive(Debug, Clone)]
 pub enum SSTableIndex {
    /// Index backed by [`crate::sstable_index_v2::SSTableIndex`].
    V2(crate::sstable_index_v2::SSTableIndex),
    /// Index backed by [`SSTableIndexV3`].
    V3(SSTableIndexV3),
    /// Index backed by [`SSTableIndexV3Empty`], which resolves every lookup
    /// to one single block.
    V3Empty(SSTableIndexV3Empty),
 }
 impl SSTableIndex {
    /// Returns the [`BlockAddr`] of block `block_id`, as reported by the wrapped index.
    pub(crate) fn get_block(&self, block_id: u64) -> Option<BlockAddr> {
        match self {
            SSTableIndex::V2(index) => index.get_block(block_id as usize),
            SSTableIndex::V3(index) => index.get_block(block_id),
            SSTableIndex::V3Empty(index) => index.get_block(block_id),
        }
    }
    /// Returns the id of the block that would contain `key`.
    ///
    /// Returns `None` when the wrapped index finds no block for `key`, i.e. when
    /// `key` is lexicographically after the last key recorded.
    pub(crate) fn locate_with_key(&self, key: &[u8]) -> Option<u64> {
        match self {
            SSTableIndex::V2(index) => {
                let block_id = index.locate_with_key(key)?;
                Some(block_id as u64)
            }
            SSTableIndex::V3(index) => index.locate_with_key(key),
            SSTableIndex::V3Empty(index) => index.locate_with_key(key),
        }
    }
    /// Returns the [`BlockAddr`] of the block that would contain `key`.
    ///
    /// Returns `None` when the wrapped index finds no block for `key`, i.e. when
    /// `key` is lexicographically after the last key recorded.
    pub fn get_block_with_key(&self, key: &[u8]) -> Option<BlockAddr> {
        // Locate the block, then resolve its address.
        let block_id = self.locate_with_key(key)?;
        self.get_block(block_id)
    }
    /// Returns the id of the block containing the `ord`-th term.
    pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
        match self {
            SSTableIndex::V2(index) => index.locate_with_ord(ord) as u64,
            SSTableIndex::V3(index) => index.locate_with_ord(ord),
            SSTableIndex::V3Empty(index) => index.locate_with_ord(ord),
        }
    }
    /// Returns the [`BlockAddr`] of the block containing the `ord`-th term.
    pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
        match self {
            SSTableIndex::V2(index) => index.get_block_with_ord(ord),
            SSTableIndex::V3(index) => index.get_block_with_ord(ord),
            SSTableIndex::V3Empty(index) => index.get_block_with_ord(ord),
        }
    }
 }
 /// Sstable index made of an FST mapping keys to block ids, and a store
 /// giving the address of each block id.
 #[derive(Debug, Clone)]
 pub struct SSTableIndexV3 {
    /// Maps each block's `last_key_or_greater` to the id of that block.
    fst_index: Arc<Map<OwnedBytes>>,
    /// Resolves a block id (or a term ordinal) to a [`BlockAddr`].
    block_addr_store: BlockAddrStore,
 }
 impl SSTableIndexV3 {
    /// Load an index from its binary representation.
    ///
    /// `data` holds the FST in its first `fst_length` bytes, followed by the
    /// serialized block address store.
    ///
    /// # Errors
    ///
    /// Returns [`SSTableDataCorruption`] if `fst_length` is larger than `data`,
    /// if the FST cannot be parsed, or if the block address store cannot be opened.
    pub fn load(
        data: OwnedBytes,
        fst_length: u64,
    ) -> Result<SSTableIndexV3, SSTableDataCorruption> {
        // Reject an out-of-range split point up front instead of handing it to
        // `split`: a bogus length is data corruption, not a programming error.
        if fst_length > data.len() as u64 {
            return Err(SSTableDataCorruption);
        }
        let (fst_slice, block_addr_store_slice) = data.split(fst_length as usize);
        let fst_index = Fst::new(fst_slice)
            .map_err(|_| SSTableDataCorruption)?
            .into();
        let block_addr_store =
            BlockAddrStore::open(block_addr_store_slice).map_err(|_| SSTableDataCorruption)?;
        Ok(SSTableIndexV3 {
            fst_index: Arc::new(fst_index),
            block_addr_store,
        })
    }
    /// Get the [`BlockAddr`] of the requested block.
    ///
    /// Returns `None` if the store has no block with this id.
    pub(crate) fn get_block(&self, block_id: u64) -> Option<BlockAddr> {
        self.block_addr_store.get(block_id)
    }
    /// Get the block id of the block that would contain `key`.
    ///
    /// Returns None if `key` is lexicographically after the last key recorded.
    pub(crate) fn locate_with_key(&self, key: &[u8]) -> Option<u64> {
        // The first FST entry that is greater or equal to `key` designates the block.
        self.fst_index
            .range()
            .ge(key)
            .into_stream()
            .next()
            .map(|(_key, id)| id)
    }
    /// Get the [`BlockAddr`] of the block that would contain `key`.
    ///
    /// Returns None if `key` is lexicographically after the last key recorded.
    pub fn get_block_with_key(&self, key: &[u8]) -> Option<BlockAddr> {
        self.locate_with_key(key).and_then(|id| self.get_block(id))
    }
    /// Get the block id of the block containing the `ord`-th term.
    pub(crate) fn locate_with_ord(&self, ord: TermOrdinal) -> u64 {
        self.block_addr_store.binary_search_ord(ord).0
    }
    /// Get the [`BlockAddr`] of the block containing the `ord`-th term.
    pub(crate) fn get_block_with_ord(&self, ord: TermOrdinal) -> BlockAddr {
        self.block_addr_store.binary_search_ord(ord).1
    }
 }
 /// Index that knows a single block and answers every lookup with it.
 #[derive(Debug, Clone)]
 pub struct SSTableIndexV3Empty {
    /// Address of the only block known to this index.
    block_addr: BlockAddr,
 }
 impl SSTableIndexV3Empty {
    /// Creates an index whose single block starts at ordinal 0 and spans
    /// the bytes `0..index_start_pos`.
    pub fn load(index_start_pos: usize) -> SSTableIndexV3Empty {
        SSTableIndexV3Empty {
            block_addr: BlockAddr {
                first_ordinal: 0,
                byte_range: 0..index_start_pos,
            },
        }
    }
    /// Get the [`BlockAddr`] of the requested block.
    ///
    /// The block id is ignored: the single block is always returned.
    pub(crate) fn get_block(&self, _block_id: u64) -> Option<BlockAddr> {
        Some(self.block_addr.clone())
    }
    /// Get the block id of the block that would contain `key`.
    ///
    /// The key is ignored: this always returns `Some(0)`.
    pub(crate) fn locate_with_key(&self, _key: &[u8]) -> Option<u64> {
        Some(0)
    }
    /// Get the [`BlockAddr`] of the block that would contain `key`.
    ///
    /// The key is ignored: the single block is always returned.
    pub fn get_block_with_key(&self, _key: &[u8]) -> Option<BlockAddr> {
        Some(self.block_addr.clone())
    }
    /// Get the block id of the block containing the `ord`-th term.
    ///
    /// The ordinal is ignored: this always returns 0.
    pub(crate) fn locate_with_ord(&self, _ord: TermOrdinal) -> u64 {
        0
    }
    /// Get the [`BlockAddr`] of the block containing the `ord`-th term.
    ///
    /// The ordinal is ignored: the single block is always returned.
    pub(crate) fn get_block_with_ord(&self, _ord: TermOrdinal) -> BlockAddr {
        self.block_addr.clone()
    }
 }
 /// Address of a block: the ordinal of its first term and the byte range it spans.
 #[derive(Clone, Eq, PartialEq, Debug)]
 pub struct BlockAddr {
    /// Ordinal of the first term of the block.
    pub first_ordinal: u64,
    /// Byte range of the block.
    pub byte_range: Range<usize>,
 }
 impl BlockAddr {
    /// Returns the same address without the end of its byte range.
    fn to_block_start(&self) -> BlockStartAddr {
        let BlockAddr {
            first_ordinal,
            byte_range,
        } = self;
        BlockStartAddr {
            first_ordinal: *first_ordinal,
            byte_range_start: byte_range.start,
        }
    }
 }
 /// A [`BlockAddr`] stripped of the end of its byte range.
 #[derive(Debug, Clone, PartialEq, Eq)]
 struct BlockStartAddr {
    /// Ordinal of the first term of the block.
    first_ordinal: u64,
    /// Position of the first byte of the block.
    byte_range_start: usize,
 }
 impl BlockStartAddr {
    /// Rebuilds a full [`BlockAddr`] by supplying the end of the byte range.
    fn to_block_addr(&self, byte_range_end: usize) -> BlockAddr {
        let byte_range = Range {
            start: self.byte_range_start,
            end: byte_range_end,
        };
        BlockAddr {
            first_ordinal: self.first_ordinal,
            byte_range,
        }
    }
 }
 /// Index entry for one block: an upper bound of its keys and its address.
 #[derive(Debug, Clone)]
 pub(crate) struct BlockMeta {
    /// Any byte string that is lexicographically greater or equal to
    /// the last key in the block,
    /// and yet strictly smaller than the first key in the next block.
    pub last_key_or_greater: Vec<u8>,
    /// Address of the block this entry describes.
    pub block_addr: BlockAddr,
 }
 // Serialized as two `u64`: the start of the byte range, then the first ordinal.
 impl BinarySerializable for BlockStartAddr {
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        // The start position is written as a `u64` whatever the width of `usize`.
        let start = self.byte_range_start as u64;
        start.serialize(writer)?;
        self.first_ordinal.serialize(writer)
    }
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        // Fields are read back in the order `serialize` wrote them.
        let byte_range_start = u64::deserialize(reader)? as usize;
        let first_ordinal = u64::deserialize(reader)?;
        Ok(BlockStartAddr {
            first_ordinal,
            byte_range_start,
        })
    }
    // Provided method
    fn num_bytes(&self) -> u64 {
        BlockStartAddr::SIZE_IN_BYTES as u64
    }
 }
 impl FixedSize for BlockStartAddr {
    // Two `u64` fields: `byte_range_start` and `first_ordinal`.
    const SIZE_IN_BYTES: usize = 2 * u64::SIZE_IN_BYTES;
 }
 /// Given that `left < right`,
 /// mutates `left` into a byte string `left'` that is not longer than `left` and
 /// matches `left <= left' < right`.
 ///
 /// # Panics
 ///
 /// Panics if `left < right` does not hold.
 fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
    assert!(&left[..] < right);
    let common_len = common_prefix_len(left, right);
    if left.len() == common_len {
        // `left` is a prefix of `right`: it is left untouched.
        return;
    }
    // It is possible to do one character shorter in some case,
    // but it is not worth the extra complexity.
    //
    // Look, after the first differing byte, for the first byte that can be
    // incremented without overflowing.
    let bump_pos = left
        .iter()
        .enumerate()
        .skip(common_len + 1)
        .find(|(_, byte)| **byte != u8::MAX)
        .map(|(pos, _)| pos);
    if let Some(pos) = bump_pos {
        left[pos] += 1;
        left.truncate(pos + 1);
    }
 }
 /// Accumulates the metadata of the blocks of an sstable, and serializes
 /// them as an index.
 #[derive(Default)]
 pub struct SSTableIndexBuilder {
    /// Blocks registered so far, in insertion order.
    blocks: Vec<BlockMeta>,
 }
 impl SSTableIndexBuilder {
    /// Replaces the key recorded for the last block by a shorter one, when possible.
    ///
    /// To keep the index light, the last key of the last block is swapped for a
    /// shorter byte string that is still smaller than `next_key`.
    /// Does nothing if no block has been added yet.
    pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) {
        if let Some(BlockMeta {
            last_key_or_greater,
            ..
        }) = self.blocks.last_mut()
        {
            find_shorter_str_in_between(last_key_or_greater, next_key);
        }
    }
    /// Registers a new block, given its last key, its byte range and the ordinal
    /// of its first term.
    pub fn add_block(&mut self, last_key: &[u8], byte_range: Range<usize>, first_ordinal: u64) {
        let block_addr = BlockAddr {
            byte_range,
            first_ordinal,
        };
        let last_key_or_greater = last_key.to_vec();
        self.blocks.push(BlockMeta {
            last_key_or_greater,
            block_addr,
        })
    }
    /// Writes the index into `wrt`: first the FST, then the block address store.
    ///
    /// Returns the number of bytes taken by the FST. With at most one block,
    /// nothing is written and 0 is returned.
    pub fn serialize<W: std::io::Write>(&self, wrt: W) -> io::Result<u64> {
        if self.blocks.len() <= 1 {
            return Ok(0);
        }
        // The FST maps each block key to the position of the block.
        let mut fst_builder = MapBuilder::new(common::CountingWriter::wrap(wrt))
            .map_err(fst_error_to_io_error)?;
        for (block_id, block) in (0u64..).zip(self.blocks.iter()) {
            fst_builder
                .insert(&block.last_key_or_greater, block_id)
                .map_err(fst_error_to_io_error)?;
        }
        let counting_writer = fst_builder.into_inner().map_err(fst_error_to_io_error)?;
        let fst_num_bytes = counting_writer.written_bytes();
        let mut wrt = counting_writer.finish();
        // The block addresses follow the FST.
        let mut addr_store_writer = BlockAddrStoreWriter::new();
        self.blocks
            .iter()
            .try_for_each(|block| addr_store_writer.write_block_meta(block.block_addr.clone()))?;
        addr_store_writer.serialize(&mut wrt)?;
        Ok(fst_num_bytes)
    }
 }
 fn fst_error_to_io_error(error: tantivy_fst::Error) -> io::Error {
    match error {
        tantivy_fst::Error::Fst(fst_error) => io::Error::new(io::ErrorKind::Other, fst_error),
        tantivy_fst::Error::Io(ioerror) => ioerror,
    }
 }
 /// Number of block addresses grouped in one store block: the writer flushes
 /// once this many are pending, and the reader divides block ids by it.
 const STORE_BLOCK_LEN: usize = 128;
 /// Metadata of one store block, i.e. of one group of block addresses.
 ///
 /// Addresses are encoded as the bit-packed difference to a linear prediction
 /// (`slope * index`) anchored on the first block of the group.
 #[derive(Debug)]
 struct BlockAddrBlockMetadata {
    /// Byte offset of the bit-packed data of this group in the address buffer.
    offset: u64,
    /// Start position and first ordinal of the first block of the group.
    /// All packed values are relative to it.
    ref_block_addr: BlockStartAddr,
    /// Slope of the linear prediction of range starts.
    range_start_slope: u32,
    /// Slope of the linear prediction of first ordinals.
    first_ordinal_slope: u32,
    /// Number of bits of each packed range start.
    range_start_nbits: u8,
    /// Number of bits of each packed first ordinal.
    first_ordinal_nbits: u8,
    /// Number of blocks in the group, not counting the first (reference) one.
    block_len: u16,
    // these fields are computed on deserialization, and not stored
    /// `1 << (range_start_nbits - 1)`: bias added to range starts before packing.
    range_shift: i64,
    /// `1 << (first_ordinal_nbits - 1)`: bias added to first ordinals before packing.
    ordinal_shift: i64,
 }
 impl BlockAddrBlockMetadata {
    /// Number of bits of one packed entry: a range start followed by a first ordinal.
    fn num_bits(&self) -> u8 {
        self.first_ordinal_nbits + self.range_start_nbits
    }
    /// Decodes the address of the `inner_offset`-th block of this group.
    ///
    /// `data` is the bit-packed payload of the group. Packed entry `k` describes
    /// block `k + 1`; block 0 is the reference block held in the metadata. The end
    /// of a block's range is read from the range start of the following entry.
    ///
    /// Returns `None` if `inner_offset` is past the last block of the group, or if
    /// `data` is too short to hold the requested entry.
    fn deserialize_block_addr(&self, data: &[u8], inner_offset: usize) -> Option<BlockAddr> {
        if inner_offset == 0 {
            // Reference block: only the end of its range has to be decoded, from the
            // range start stored at the very beginning of the payload.
            let range_end = self.ref_block_addr.byte_range_start
                + extract_bits(data, 0, self.range_start_nbits) as usize
                + self.range_start_slope as usize
                - self.range_shift as usize;
            return Some(self.ref_block_addr.to_block_addr(range_end));
        }
        // From here on, `inner_offset` is the index of the packed entry.
        let inner_offset = inner_offset - 1;
        if inner_offset >= self.block_len as usize {
            return None;
        }
        let num_bits = self.num_bits() as usize;
        // Bit addresses of the range start, of the first ordinal, and of the range
        // start of the next entry (which is the end of this block's range).
        let range_start_addr = num_bits * inner_offset;
        let ordinal_addr = range_start_addr + self.range_start_nbits as usize;
        let range_end_addr = range_start_addr + num_bits;
        // The last bit read is the end of the next range start: make sure it is in `data`.
        if (range_end_addr + self.range_start_nbits as usize + 7) / 8 > data.len() {
            return None;
        }
        // value = reference + packed residual + slope * index - bias
        let range_start = self.ref_block_addr.byte_range_start
            + extract_bits(data, range_start_addr, self.range_start_nbits) as usize
            + self.range_start_slope as usize * (inner_offset + 1)
            - self.range_shift as usize;
        let first_ordinal = self.ref_block_addr.first_ordinal
            + extract_bits(data, ordinal_addr, self.first_ordinal_nbits)
            + self.first_ordinal_slope as u64 * (inner_offset + 1) as u64
            - self.ordinal_shift as u64;
        let range_end = self.ref_block_addr.byte_range_start
            + extract_bits(data, range_end_addr, self.range_start_nbits) as usize
            + self.range_start_slope as usize * (inner_offset + 2)
            - self.range_shift as usize;
        Some(BlockAddr {
            first_ordinal,
            byte_range: range_start..range_end,
        })
    }
    /// Searches this group for the block containing the `target_ord`-th term.
    ///
    /// Returns the position of that block inside the group, and its address.
    /// `target_ord` is expected not to be smaller than the first ordinal of the
    /// reference block.
    fn bisect_for_ord(&self, data: &[u8], target_ord: TermOrdinal) -> (u64, BlockAddr) {
        // Packed ordinals are relative to the reference block.
        let inner_target_ord = target_ord - self.ref_block_addr.first_ordinal;
        let num_bits = self.num_bits() as usize;
        let range_start_nbits = self.range_start_nbits as usize;
        // Relative first ordinal of packed entry `index`, i.e. of block `index + 1`.
        let get_ord = |index| {
            extract_bits(
                data,
                num_bits * index as usize + range_start_nbits,
                self.first_ordinal_nbits,
            ) + self.first_ordinal_slope as u64 * (index + 1)
                - self.ordinal_shift as u64
        };
        let inner_offset = match binary_search(self.block_len as u64, |index| {
            get_ord(index).cmp(&inner_target_ord)
        }) {
            // Entry `inner_offset` starts exactly at the target: that is block `inner_offset + 1`.
            Ok(inner_offset) => inner_offset + 1,
            // `inner_offset` entries start before the target: the last of them is
            // block `inner_offset` (the reference block when it is 0).
            Err(inner_offset) => inner_offset,
        };
        // we can unwrap because inner_offset <= self.block_len
        (
            inner_offset,
            self.deserialize_block_addr(data, inner_offset as usize)
                .unwrap(),
        )
    }
 }
 // TODO move this function to tantivy_common?
 /// Reads the `num_bits` bits starting at bit address `addr_bits` of `data`,
 /// interpreting `data` as a little-endian bit stream.
 ///
 /// Bytes past the end of `data` are read as zeros.
 ///
 /// # Panics
 ///
 /// Panics if `num_bits > 56`, or if the first byte to read is past the end of `data`.
 #[inline(always)]
 fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
    // With at most 56 bits and a shift of at most 7 bits, the value always fits
    // in the 8 bytes starting at the first byte.
    assert!(num_bits <= 56);
    let first_byte = addr_bits / 8;
    let shift = (addr_bits % 8) as u64;
    let mut word = [0u8; 8];
    match data.get(first_byte..first_byte + 8) {
        Some(full_word) => word.copy_from_slice(full_word),
        None => {
            // The buffer is not large enough: copy the few remaining bytes,
            // the rest of the word stays padded with 0s.
            let remaining = &data[first_byte..];
            word[..remaining.len()].copy_from_slice(remaining);
        }
    }
    let mask = (1u64 << u64::from(num_bits)) - 1;
    (u64::from_le_bytes(word) >> shift) & mask
 }
 impl BinarySerializable for BlockAddrBlockMetadata {
    fn serialize<W: Write + ?Sized>(&self, write: &mut W) -> io::Result<()> {
        self.offset.serialize(write)?;
        self.ref_block_addr.serialize(write)?;
        self.range_start_slope.serialize(write)?;
        self.first_ordinal_slope.serialize(write)?;
        write.write_all(&[self.first_ordinal_nbits, self.range_start_nbits])?;
        self.block_len.serialize(write)?;
        self.num_bits();
        Ok(())
    }
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let offset = u64::deserialize(reader)?;
        let ref_block_addr = BlockStartAddr::deserialize(reader)?;
        let range_start_slope = u32::deserialize(reader)?;
        let first_ordinal_slope = u32::deserialize(reader)?;
        let mut buffer = [0u8; 2];
        reader.read_exact(&mut buffer)?;
        let first_ordinal_nbits = buffer[0];
        let range_start_nbits = buffer[1];
        let block_len = u16::deserialize(reader)?;
        Ok(BlockAddrBlockMetadata {
            offset,
            ref_block_addr,
            range_start_slope,
            first_ordinal_slope,
            range_start_nbits,
            first_ordinal_nbits,
            block_len,
            range_shift: 1 << (range_start_nbits - 1),
            ordinal_shift: 1 << (first_ordinal_nbits - 1),
        })
    }
 }
 impl FixedSize for BlockAddrBlockMetadata {
    // Sum of the stored fields: `offset`, `ref_block_addr`, the two slopes,
    // the two bit widths and `block_len`. The shifts are not stored.
    const SIZE_IN_BYTES: usize = u64::SIZE_IN_BYTES
        + BlockStartAddr::SIZE_IN_BYTES
        + 2 * u32::SIZE_IN_BYTES
        + 2 * u8::SIZE_IN_BYTES
        + u16::SIZE_IN_BYTES;
 }
 /// Read-only store resolving block ids and term ordinals to block addresses.
 #[derive(Debug, Clone)]
 struct BlockAddrStore {
    /// Fixed-size `BlockAddrBlockMetadata` records, one per group of blocks.
    block_meta_bytes: OwnedBytes,
    /// Bit-packed addresses, indexed through the `offset` of each record.
    addr_bytes: OwnedBytes,
 }
 impl BlockAddrStore {
    fn open(term_info_store_file: OwnedBytes) -> io::Result<BlockAddrStore> {
        let (mut len_slice, main_slice) = term_info_store_file.split(8);
        let len = u64::deserialize(&mut len_slice)? as usize;
        let (block_meta_bytes, addr_bytes) = main_slice.split(len);
        Ok(BlockAddrStore {
            block_meta_bytes,
            addr_bytes,
        })
    }
    fn get_block_meta(&self, store_block_id: usize) -> Option<BlockAddrBlockMetadata> {
        let mut block_data: &[u8] = self
            .block_meta_bytes
            .get(store_block_id * BlockAddrBlockMetadata::SIZE_IN_BYTES..)?;
        BlockAddrBlockMetadata::deserialize(&mut block_data).ok()
    }
    fn get(&self, block_id: u64) -> Option<BlockAddr> {
        let store_block_id = (block_id as usize) / STORE_BLOCK_LEN;
        let inner_offset = (block_id as usize) % STORE_BLOCK_LEN;
        let block_addr_block_data = self.get_block_meta(store_block_id)?;
        block_addr_block_data.deserialize_block_addr(
            &self.addr_bytes[block_addr_block_data.offset as usize..],
            inner_offset,
        )
    }
    fn binary_search_ord(&self, ord: TermOrdinal) -> (u64, BlockAddr) {
        let max_block =
            (self.block_meta_bytes.len() / BlockAddrBlockMetadata::SIZE_IN_BYTES) as u64;
        let get_first_ordinal = |block_id| {
            // we can unwrap because block_id < max_block
            self.get(block_id * STORE_BLOCK_LEN as u64)
                .unwrap()
                .first_ordinal
        };
        let store_block_id =
            binary_search(max_block, |block_id| get_first_ordinal(block_id).cmp(&ord));
        let store_block_id = match store_block_id {
            Ok(store_block_id) => {
                let block_id = store_block_id * STORE_BLOCK_LEN as u64;
                // we can unwrap because store_block_id < max_block
                return (block_id, self.get(block_id).unwrap());
            }
            Err(store_block_id) => store_block_id - 1,
        };
        // we can unwrap because store_block_id < max_block
        let block_addr_block_data = self.get_block_meta(store_block_id as usize).unwrap();
        let (inner_offset, block_addr) = block_addr_block_data.bisect_for_ord(
            &self.addr_bytes[block_addr_block_data.offset as usize..],
            ord,
        );
        (
            store_block_id * STORE_BLOCK_LEN as u64 + inner_offset,
            block_addr,
        )
    }
 }
 /// Binary search over the indexes `0..max`.
 ///
 /// `cmp_fn(index)` returns how the element at `index` compares to the target.
 /// Returns `Ok(index)` for an element comparing equal, or `Err(index)` with the
 /// position where the target would have to be inserted.
 fn binary_search(max: u64, cmp_fn: impl Fn(u64) -> std::cmp::Ordering) -> Result<u64, u64> {
    use std::cmp::Ordering;
    // The search window is `lo..hi`.
    let (mut lo, mut hi) = (0u64, max);
    while lo < hi {
        let probe = lo + (hi - lo) / 2;
        match cmp_fn(probe) {
            Ordering::Less => lo = probe + 1,
            Ordering::Greater => hi = probe,
            Ordering::Equal => return Ok(probe),
        }
    }
    Err(lo)
 }
 /// Builds the serialized form of a block address store, one group of
 /// `STORE_BLOCK_LEN` block addresses at a time.
 struct BlockAddrStoreWriter {
    /// Serialized metadata records of the groups flushed so far.
    buffer_block_metas: Vec<u8>,
    /// Bit-packed addresses of the groups flushed so far.
    buffer_addrs: Vec<u8>,
    /// Block addresses of the group being assembled.
    block_addrs: Vec<BlockAddr>,
 }
 impl BlockAddrStoreWriter {
    /// Creates an empty writer.
    fn new() -> Self {
        BlockAddrStoreWriter {
            buffer_block_metas: Vec::new(),
            buffer_addrs: Vec::new(),
            block_addrs: Vec::with_capacity(STORE_BLOCK_LEN),
        }
    }
    /// Encodes the pending block addresses as one group, and clears them.
    ///
    /// The first block becomes the reference of the group and goes into the
    /// metadata record. Every following block is bit-packed as the biased difference
    /// between its (relative) value and the prediction `slope * index`. The end of
    /// the last range is packed last. Does nothing if no block is pending.
    fn flush_block(&mut self) -> io::Result<()> {
        if self.block_addrs.is_empty() {
            return Ok(());
        }
        let ref_block_addr = self.block_addrs[0].clone();
        // Make start positions and ordinals relative to the reference block.
        for block_addr in &mut self.block_addrs {
            block_addr.byte_range.start -= ref_block_addr.byte_range.start;
            block_addr.first_ordinal -= ref_block_addr.first_ordinal;
        }
        // we are only called if block_addrs is not empty
        let mut last_block_addr = self.block_addrs.last().unwrap().clone();
        // Range ends were not rebased by the loop above; only the last one is needed.
        last_block_addr.byte_range.end -= ref_block_addr.byte_range.start;
        // we skip(1), so we never give an index of 0 to find_best_slope
        let (range_start_slope, range_start_nbits) = find_best_slope(
            self.block_addrs
                .iter()
                .map(|block| block.byte_range.start as u64)
                .chain(std::iter::once(last_block_addr.byte_range.end as u64))
                .enumerate()
                .skip(1),
        );
        // we skip(1), so we never give an index of 0 to find_best_slope
        let (first_ordinal_slope, first_ordinal_nbits) = find_best_slope(
            self.block_addrs
                .iter()
                .map(|block| block.first_ordinal)
                .enumerate()
                .skip(1),
        );
        // Bias added to the (possibly negative) residuals before packing them.
        let range_shift = 1 << (range_start_nbits - 1);
        let ordinal_shift = 1 << (first_ordinal_nbits - 1);
        let block_addr_block_meta = BlockAddrBlockMetadata {
            offset: self.buffer_addrs.len() as u64,
            ref_block_addr: ref_block_addr.to_block_start(),
            range_start_slope,
            first_ordinal_slope,
            range_start_nbits,
            first_ordinal_nbits,
            // The reference block is not counted.
            block_len: self.block_addrs.len() as u16 - 1,
            range_shift,
            ordinal_shift,
        };
        block_addr_block_meta.serialize(&mut self.buffer_block_metas)?;
        let mut bit_packer = BitPacker::new();
        // The reference block is skipped: it lives in the metadata record.
        for (i, block_addr) in self.block_addrs.iter().enumerate().skip(1) {
            let range_pred = (range_start_slope as usize * i) as i64;
            bit_packer.write(
                (block_addr.byte_range.start as i64 - range_pred + range_shift) as u64,
                range_start_nbits,
                &mut self.buffer_addrs,
            )?;
            let first_ordinal_pred = (first_ordinal_slope as u64 * i as u64) as i64;
            bit_packer.write(
                (block_addr.first_ordinal as i64 - first_ordinal_pred + ordinal_shift) as u64,
                first_ordinal_nbits,
                &mut self.buffer_addrs,
            )?;
        }
        // Finally, the end of the last range, predicted as one more point of the
        // range start series.
        let range_pred = (range_start_slope as usize * self.block_addrs.len()) as i64;
        bit_packer.write(
            (last_block_addr.byte_range.end as i64 - range_pred + range_shift) as u64,
            range_start_nbits,
            &mut self.buffer_addrs,
        )?;
        bit_packer.flush(&mut self.buffer_addrs)?;
        self.block_addrs.clear();
        Ok(())
    }
    /// Adds a block address, flushing a group once `STORE_BLOCK_LEN` are pending.
    fn write_block_meta(&mut self, block_addr: BlockAddr) -> io::Result<()> {
        self.block_addrs.push(block_addr);
        if self.block_addrs.len() >= STORE_BLOCK_LEN {
            self.flush_block()?;
        }
        Ok(())
    }
    /// Flushes the pending blocks, then writes the store into `wrt`: the length of
    /// the metadata records, the metadata records, and the bit-packed addresses.
    fn serialize<W: std::io::Write>(&mut self, wrt: &mut W) -> io::Result<()> {
        self.flush_block()?;
        let len = self.buffer_block_metas.len() as u64;
        len.serialize(wrt)?;
        wrt.write_all(&self.buffer_block_metas)?;
        wrt.write_all(&self.buffer_addrs)?;
        Ok(())
    }
 }
 /// Given an iterator over (index, value), returns the slope, and the number of bits needed to
 /// represent the error of the prediction made by this slope.
 ///
 /// The iterator may be empty, but all indexes in it must be non-zero.
 fn find_best_slope(elements: impl Iterator<Item = (usize, u64)> + Clone) -> (u32, u8) {
    // The elements are iterated twice: once to pick the slope, once to measure the error.
    let slope_iterator = elements.clone();
    let derivation_iterator = elements;
    let mut min_slope_idx = 1;
    let mut min_slope_val = 0;
    let mut min_slope = u32::MAX;
    let mut max_slope_idx = 1;
    let mut max_slope_val = 0;
    let mut max_slope = 0;
    // Track the points with the smallest and the largest `value / index` ratio.
    for (index, value) in slope_iterator {
        let slope = (value / index as u64) as u32;
        if slope <= min_slope {
            min_slope = slope;
            min_slope_idx = index;
            min_slope_val = value;
        }
        if slope >= max_slope {
            max_slope = slope;
            max_slope_idx = index;
            max_slope_val = value;
        }
    }
    // The above is a heuristic giving the "highest" and "lowest" points. It's imperfect in that
    // a point that appears earlier might have a high slope derivation, but a smaller absolute
    // derivation than a later point.
    // The actual best values can be obtained by using the simplex method, but the improvement is
    // likely minimal, and the computation is way more complex.
    //
    // Assuming these points are the furthest up and down, we find the slope that would cause the
    // same positive derivation for the highest as negative derivation for the lowest.
    // A is the optimal slope. B is the derivation to the guess
    //
    // 0 = min_slope_val - min_slope_idx * A - B
    // 0 = max_slope_val - max_slope_idx * A + B
    //
    // 0 = min_slope_val + max_slope_val - (min_slope_idx + max_slope_idx) * A
    // (min_slope_val + max_slope_val) / (min_slope_idx + max_slope_idx) = A
    //
    // we actually add some correcting factor to have proper rounding, not truncation.
    let denominator = (min_slope_idx + max_slope_idx) as u64;
    let final_slope = ((min_slope_val + max_slope_val + denominator / 2) / denominator) as u32;
    // we don't solve for B because our choice of points is suboptimal, so it's actually a lower
    // bound and we need to iterate to find the actual worst value.
    let max_derivation: u64 = derivation_iterator
        .map(|(index, value)| (value as i64 - final_slope as i64 * index as i64).unsigned_abs())
        .max()
        .unwrap_or(0);
    // One extra bit, as the derivation can be positive or negative.
    (final_slope, compute_num_bits(max_derivation) + 1)
 }
 #[cfg(test)]
 mod tests {
    use common::OwnedBytes;
    use super::{BlockAddr, SSTableIndexBuilder, SSTableIndexV3};
    use crate::SSTableDataCorruption;
    // Round trip: serializes a four-block index, loads it back, then checks
    // lookups by key and by ordinal.
    #[test]
    fn test_sstable_index() {
        let mut sstable_builder = SSTableIndexBuilder::default();
        sstable_builder.add_block(b"aaa", 10..20, 0u64);
        sstable_builder.add_block(b"bbbbbbb", 20..30, 5u64);
        sstable_builder.add_block(b"ccc", 30..40, 10u64);
        sstable_builder.add_block(b"dddd", 40..50, 15u64);
        let mut buffer: Vec<u8> = Vec::new();
        let fst_len = sstable_builder.serialize(&mut buffer).unwrap();
        let buffer = OwnedBytes::new(buffer);
        let sstable_index = SSTableIndexV3::load(buffer, fst_len).unwrap();
        // "bbbde" sorts after "bbbbbbb", so it belongs to the third block.
        assert_eq!(
            sstable_index.get_block_with_key(b"bbbde"),
            Some(BlockAddr {
                first_ordinal: 10u64,
                byte_range: 30..40
            })
        );
        // Lookups by key: a key maps to the first block whose last key is not smaller.
        assert_eq!(sstable_index.locate_with_key(b"aa").unwrap(), 0);
        assert_eq!(sstable_index.locate_with_key(b"aaa").unwrap(), 0);
        assert_eq!(sstable_index.locate_with_key(b"aab").unwrap(), 1);
        assert_eq!(sstable_index.locate_with_key(b"ccc").unwrap(), 2);
        // A key after the last recorded key matches no block.
        assert!(sstable_index.locate_with_key(b"e").is_none());
        // Lookups by ordinal: an ordinal maps to the last block starting at or before it.
        assert_eq!(sstable_index.locate_with_ord(0), 0);
        assert_eq!(sstable_index.locate_with_ord(1), 0);
        assert_eq!(sstable_index.locate_with_ord(4), 0);
        assert_eq!(sstable_index.locate_with_ord(5), 1);
        assert_eq!(sstable_index.locate_with_ord(100), 3);
    }
    // Loading an index whose serialized bytes were tampered with must fail
    // with `SSTableDataCorruption`.
    #[test]
    fn test_sstable_with_corrupted_data() {
        let mut sstable_builder = SSTableIndexBuilder::default();
        let blocks = [
            (&b"aaa"[..], 10..20, 0u64),
            (&b"bbbbbbb"[..], 20..30, 5u64),
            (&b"ccc"[..], 30..40, 10u64),
            (&b"dddd"[..], 40..50, 15u64),
        ];
        for (last_key, byte_range, first_ordinal) in blocks {
            sstable_builder.add_block(last_key, byte_range, first_ordinal);
        }
        let mut serialized: Vec<u8> = Vec::new();
        let fst_len = sstable_builder.serialize(&mut serialized).unwrap();
        // Simulate corruption by overwriting a single serialized byte.
        serialized[2] = 9u8;
        let load_result = SSTableIndexV3::load(OwnedBytes::new(serialized), fst_len);
        let data_corruption_err = load_result.err().unwrap();
        assert!(matches!(data_corruption_err, SSTableDataCorruption));
    }
    // Runs `find_shorter_str_in_between` on a copy of `left` and checks the
    // result: it is not longer than `left`, and `left <= result < right`.
    //
    // `#[track_caller]` makes a failing assertion point at the calling test line.
    #[track_caller]
    fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) {
        let mut left_buf = left.to_vec();
        super::find_shorter_str_in_between(&mut left_buf, right);
        assert!(left_buf.len() <= left.len());
        assert!(left <= &left_buf);
        assert!(&left_buf[..] < right);
    }
    // Hand-picked cases: empty `left`, `left` being a prefix of `right`,
    // keys diverging early or late, and runs of `0xFF` bytes that cannot be incremented.
    #[test]
    fn test_find_shorter_str_in_between() {
        test_find_shorter_str_in_between_aux(b"", b"hello");
        test_find_shorter_str_in_between_aux(b"abc", b"abcd");
        test_find_shorter_str_in_between_aux(b"abcd", b"abd");
        test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]);
        test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]);
        test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]);
    }
    use proptest::prelude::*;
    // Property test: for 100 random byte-string pairs, every pair with
    // `left < right` must satisfy the checks of the aux helper.
    proptest! {
        #![proptest_config(ProptestConfig::with_cases(100))]
        #[test]
        fn test_proptest_find_shorter_str(left in any::<Vec<u8>>(), right in any::<Vec<u8>>()) {
            // `find_shorter_str_in_between` asserts `left < right`; other pairs are ignored.
            if left < right {
                test_find_shorter_str_in_between_aux(&left, &right);
            }
        }
    }
    // Degenerate inputs of `find_best_slope`: no point at all, and a single
    // point, which is predicted exactly.
    #[test]
    fn test_find_best_slop() {
        let no_point = std::iter::empty();
        assert_eq!(super::find_best_slope(no_point), (0, 1));
        let single_point = std::iter::once((1, 12345));
        assert_eq!(super::find_best_slope(single_point), (12345, 1));
    }
 }
--- a/sstable/src/streamer.rs
+++ b/sstable/src/streamer.rs
@@ -110,7 +110,7 @@ where
            Bound::Included(key) | Bound::Excluded(key) => self
                .term_dict
                .sstable_index
-                .get_block_with_key(key)?
+                .get_block_with_key(key)
                .map(|block| block.first_ordinal)
                .unwrap_or(0),
            Bound::Unbounded => 0,
--- a/sstable/src/value/index.rs
+++ b/sstable/src/value/index.rs
@@ -3,10 +3,6 @@ use std::io;
 use crate::value::{deserialize_vint_u64, ValueReader, ValueWriter};
 use crate::{vint, BlockAddr};
 // TODO define a LazyIndexValueReader?
 // one which keeps state could be useful for ord_to_block fns,
 // one which doesn't at all would be perfect for term_to_block fns
 // pending bench to assess real impact
 #[derive(Default)]
 pub(crate) struct IndexValueReader {
    vals: Vec<BlockAddr>,
--- a/stacker/src/arena_hashmap.rs
+++ b/stacker/src/arena_hashmap.rs
@@ -1,51 +1,5 @@
 use std::iter::{Cloned, Filter};
 use std::mem;
 use super::{Addr, MemoryArena};
-use crate::fastcpy::fast_short_slice_copy;
+use crate::shared_arena_hashmap::SharedArenaHashMap;
 use crate::memory_arena::store;
 /// Returns the actual memory size in bytes
 /// required to create a table with a given capacity.
 /// required to create a table of size
 pub fn compute_table_memory_size(capacity: usize) -> usize {
    capacity * mem::size_of::<KeyValue>()
 }
 #[cfg(not(feature = "compare_hash_only"))]
 type HashType = u32;
 #[cfg(feature = "compare_hash_only")]
 type HashType = u64;
 /// `KeyValue` is the item stored in the hash table.
 /// The key is actually a `BytesRef` object stored in an external memory arena.
 /// The `value_addr` also points to an address in the memory arena.
 #[derive(Copy, Clone)]
 struct KeyValue {
    pub(crate) key_value_addr: Addr,
    hash: HashType,
 }
 impl Default for KeyValue {
    fn default() -> Self {
        KeyValue {
            key_value_addr: Addr::null_pointer(),
            hash: 0,
        }
    }
 }
 impl KeyValue {
    #[inline]
    fn is_empty(&self) -> bool {
        self.key_value_addr.is_null()
    }
    #[inline]
    fn is_not_empty_ref(&self) -> bool {
        !self.key_value_addr.is_null()
    }
 }
 /// Customized `HashMap` with `&[u8]` keys
 ///
@@ -56,61 +10,13 @@ impl KeyValue {
 /// The quirky API has the benefit of avoiding
 /// the computation of the hash of the key twice,
 /// or copying the key as long as there is no insert.
 pub struct ArenaHashMap {
    table: Vec<KeyValue>,
    pub memory_arena: MemoryArena,
    mask: usize,
    len: usize,
 }
 struct LinearProbing {
    pos: usize,
    mask: usize,
 }
 impl LinearProbing {
    #[inline]
    fn compute(hash: HashType, mask: usize) -> LinearProbing {
        LinearProbing {
            pos: hash as usize,
            mask,
        }
    }
    #[inline]
    fn next_probe(&mut self) -> usize {
        // Not saving the masked version removes a dependency.
        self.pos = self.pos.wrapping_add(1);
        self.pos & self.mask
    }
 }
 type IterNonEmpty<'a> = Filter<Cloned<std::slice::Iter<'a, KeyValue>>, fn(&KeyValue) -> bool>;
 pub struct Iter<'a> {
    hashmap: &'a ArenaHashMap,
    inner: IterNonEmpty<'a>,
 }
 impl<'a> Iterator for Iter<'a> {
    type Item = (&'a [u8], Addr);
    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next().map(move |kv| {
            let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
            (key, offset)
        })
    }
 }
 /// Returns the greatest power of two lower or equal to `n`.
 /// Except if n == 0, in that case, return 1.
 ///
-/// # Panics if n == 0
+/// ArenaHashMap is like SharedArenaHashMap but takes ownership
-fn compute_previous_power_of_two(n: usize) -> usize {
+/// of the memory arena. The memory arena stores the serialized
-    assert!(n > 0);
+/// keys and values.
-    let msb = (63u32 - (n as u64).leading_zeros()) as u8;
+pub struct ArenaHashMap {
-    1 << msb
+    shared_arena_hashmap: SharedArenaHashMap,
    pub memory_arena: MemoryArena,
 }
 impl Default for ArenaHashMap {
@@ -121,156 +27,44 @@ impl Default for ArenaHashMap {
 impl ArenaHashMap {
    pub fn with_capacity(table_size: usize) -> ArenaHashMap {
        let table_size_power_of_2 = compute_previous_power_of_two(table_size);
        let memory_arena = MemoryArena::default();
        let table = vec![KeyValue::default(); table_size_power_of_2];
        ArenaHashMap {
-            table,
+            shared_arena_hashmap: SharedArenaHashMap::with_capacity(table_size),
            memory_arena,
            mask: table_size_power_of_2 - 1,
            len: 0,
        }
    }
    #[inline]
    #[cfg(not(feature = "compare_hash_only"))]
    fn get_hash(&self, key: &[u8]) -> HashType {
        murmurhash32::murmurhash2(key)
    }
    #[inline]
    #[cfg(feature = "compare_hash_only")]
    fn get_hash(&self, key: &[u8]) -> HashType {
        /// Since we compare only the hash we need a high quality hash.
        use std::hash::Hasher;
        let mut hasher = ahash::AHasher::default();
        hasher.write(key);
        hasher.finish() as HashType
    }
    #[inline]
    pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
        self.memory_arena.read(addr)
    }
    #[inline]
    fn probe(&self, hash: HashType) -> LinearProbing {
        LinearProbing::compute(hash, self.mask)
    }
    #[inline]
    pub fn mem_usage(&self) -> usize {
-        self.table.len() * mem::size_of::<KeyValue>() + self.memory_arena.mem_usage()
+        self.shared_arena_hashmap.mem_usage() + self.memory_arena.mem_usage()
    }
    #[inline]
    fn is_saturated(&self) -> bool {
        self.table.len() <= self.len * 2
    }
    #[inline]
    fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
        let data = self.memory_arena.slice_from(addr);
        let key_bytes_len_bytes = unsafe { data.get_unchecked(..2) };
        let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap());
        let key_bytes: &[u8] = unsafe { data.get_unchecked(2..2 + key_bytes_len as usize) };
        (key_bytes, addr.offset(2 + key_bytes_len as u32))
    }
    #[inline]
    #[cfg(not(feature = "compare_hash_only"))]
    fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
        use crate::fastcmp::fast_short_slice_compare;
        let (stored_key, value_addr) = self.get_key_value(addr);
        if fast_short_slice_compare(stored_key, target_key) {
            Some(value_addr)
        } else {
            None
        }
    }
    #[inline]
    #[cfg(feature = "compare_hash_only")]
    fn get_value_addr_if_key_match(&self, _target_key: &[u8], addr: Addr) -> Option<Addr> {
        // For the compare_hash_only feature, it would make sense to store the keys at a different
        // memory location. Here they will just pollute the cache.
        let data = self.memory_arena.slice_from(addr);
        let key_bytes_len_bytes = &data[..2];
        let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap());
        let value_addr = addr.offset(2 + key_bytes_len as u32);
        Some(value_addr)
    }
    #[inline]
    fn set_bucket(&mut self, hash: HashType, key_value_addr: Addr, bucket: usize) {
        self.len += 1;
        self.table[bucket] = KeyValue {
            key_value_addr,
            hash,
        };
    }
    #[inline]
    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+        self.shared_arena_hashmap.is_empty()
    }
    #[inline]
    pub fn len(&self) -> usize {
-        self.len
+        self.shared_arena_hashmap.len()
    }
    #[inline]
-    pub fn iter(&self) -> Iter<'_> {
+    pub fn iter(&self) -> impl Iterator<Item = (&[u8], Addr)> {
-        Iter {
+        self.shared_arena_hashmap.iter(&self.memory_arena)
            inner: self
                .table
                .iter()
                .cloned()
                .filter(KeyValue::is_not_empty_ref),
            hashmap: self,
        }
    }
    fn resize(&mut self) {
        let new_len = (self.table.len() * 2).max(1 << 13);
        let mask = new_len - 1;
        self.mask = mask;
        let new_table = vec![KeyValue::default(); new_len];
        let old_table = mem::replace(&mut self.table, new_table);
        for key_value in old_table.into_iter().filter(KeyValue::is_not_empty_ref) {
            let mut probe = LinearProbing::compute(key_value.hash, mask);
            loop {
                let bucket = probe.next_probe();
                if self.table[bucket].is_empty() {
                    self.table[bucket] = key_value;
                    break;
                }
            }
        }
    }
    /// Get a value associated to a key.
    #[inline]
    pub fn get<V>(&self, key: &[u8]) -> Option<V>
    where V: Copy + 'static {
-        let hash = self.get_hash(key);
+        self.shared_arena_hashmap.get(key, &self.memory_arena)
        let mut probe = self.probe(hash);
        loop {
            let bucket = probe.next_probe();
            let kv: KeyValue = self.table[bucket];
            if kv.is_empty() {
                return None;
            } else if kv.hash == hash {
                if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
                    let v = self.memory_arena.read(val_addr);
                    return Some(v);
                }
            }
        }
    }
    /// `update` create a new entry for a given key if it does not exist
@@ -284,45 +78,10 @@ impl ArenaHashMap {
    /// If the key already as an associated value, then it will be passed
    /// `Some(previous_value)`.
    #[inline]
-    pub fn mutate_or_create<V>(&mut self, key: &[u8], mut updater: impl FnMut(Option<V>) -> V)
+    pub fn mutate_or_create<V>(&mut self, key: &[u8], updater: impl FnMut(Option<V>) -> V)
    where V: Copy + 'static {
-        if self.is_saturated() {
+        self.shared_arena_hashmap
-            self.resize();
+            .mutate_or_create(key, &mut self.memory_arena, updater);
        }
        let hash = self.get_hash(key);
        let mut probe = self.probe(hash);
        let mut bucket = probe.next_probe();
        let mut kv: KeyValue = self.table[bucket];
        loop {
            if kv.is_empty() {
                // The key does not exist yet.
                let val = updater(None);
                let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
                let key_addr = self.memory_arena.allocate_space(num_bytes);
                {
                    let data = self.memory_arena.slice_mut(key_addr, num_bytes);
                    let key_len_bytes: [u8; 2] = (key.len() as u16).to_le_bytes();
                    data[..2].copy_from_slice(&key_len_bytes);
                    let stop = 2 + key.len();
                    fast_short_slice_copy(key, &mut data[2..stop]);
                    store(&mut data[stop..], val);
                }
                self.set_bucket(hash, key_addr, bucket);
                return;
            }
            if kv.hash == hash {
                if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
                    let v = self.memory_arena.read(val_addr);
                    let new_v = updater(Some(v));
                    self.memory_arena.write_at(val_addr, new_v);
                    return;
                }
            }
            // This allows fetching the next bucket before the loop jmp
            bucket = probe.next_probe();
            kv = self.table[bucket];
        }
    }
 }
@@ -331,7 +90,7 @@ mod tests {
    use std::collections::HashMap;
-    use super::{compute_previous_power_of_two, ArenaHashMap};
+    use super::ArenaHashMap;
    #[test]
    fn test_hash_map() {
@@ -362,14 +121,6 @@ mod tests {
        assert_eq!(hash_map.get::<u32>(b"abc"), None);
    }
    #[test]
    fn test_compute_previous_power_of_two() {
        assert_eq!(compute_previous_power_of_two(8), 8);
        assert_eq!(compute_previous_power_of_two(9), 8);
        assert_eq!(compute_previous_power_of_two(7), 4);
        assert_eq!(compute_previous_power_of_two(u64::MAX as usize), 1 << 63);
    }
    #[test]
    fn test_many_terms() {
        let mut terms: Vec<String> = (0..20_000).map(|val| val.to_string()).collect();
--- a/stacker/src/lib.rs
+++ b/stacker/src/lib.rs
@@ -9,10 +9,12 @@ mod expull;
 mod fastcmp;
 mod fastcpy;
 mod memory_arena;
 mod shared_arena_hashmap;
-pub use self::arena_hashmap::{compute_table_memory_size, ArenaHashMap};
+pub use self::arena_hashmap::ArenaHashMap;
 pub use self::expull::ExpUnrolledLinkedList;
 pub use self::memory_arena::{Addr, MemoryArena};
 pub use self::shared_arena_hashmap::{compute_table_memory_size, SharedArenaHashMap};
 /// When adding an element in a `ArenaHashMap`, we get a unique id associated to the given key.
 pub type UnorderedId = u32;
--- a/stacker/src/shared_arena_hashmap.rs
+++ b/stacker/src/shared_arena_hashmap.rs
@@ -0,0 +1,420 @@
 use std::iter::{Cloned, Filter};
 use std::mem;
 use super::{Addr, MemoryArena};
 use crate::fastcpy::fast_short_slice_copy;
 use crate::memory_arena::store;
 /// Returns the memory size, in bytes, of a hash table holding `capacity` buckets.
 ///
 /// Only the bucket array itself is counted: one `KeyValue` per bucket.
 pub fn compute_table_memory_size(capacity: usize) -> usize {
    let bucket_num_bytes = mem::size_of::<KeyValue>();
    bucket_num_bytes * capacity
 }
 #[cfg(not(feature = "compare_hash_only"))]
 type HashType = u32;
 #[cfg(feature = "compare_hash_only")]
 type HashType = u64;
 /// `KeyValue` is the item stored in each bucket of the hash table.
 ///
 /// It does not hold the key or the value itself: both are serialized in an
 /// external memory arena and the bucket only records where to find them.
 /// A bucket whose address is null is empty.
 #[derive(Copy, Clone)]
 struct KeyValue {
    // Address, in the memory arena, of the serialized entry
    // (key length, key bytes, then the value).
    key_value_addr: Addr,
    // Hash of the key, kept so that lookups can skip non-matching buckets
    // and `resize` can re-insert buckets without reading the key again.
    hash: HashType,
 }
 impl Default for KeyValue {
    /// Builds an empty bucket: a null arena address and a zero hash.
    fn default() -> Self {
        let key_value_addr = Addr::null_pointer();
        Self {
            key_value_addr,
            hash: 0,
        }
    }
 }
 impl KeyValue {
    /// Returns `true` if this bucket holds no entry, i.e. its address is null.
    #[inline]
    fn is_empty(&self) -> bool {
        self.key_value_addr.is_null()
    }
    /// Negation of `is_empty`, with a `fn(&KeyValue) -> bool` shape
    /// so that it can be passed directly to `Iterator::filter`.
    #[inline]
    fn is_not_empty_ref(&self) -> bool {
        !self.is_empty()
    }
 }
 /// Customized `HashMap` with `&[u8]` keys
 ///
 /// Its main particularity is that rather than storing its
 /// keys in the heap, keys are stored in a memory arena
 /// inline with the values.
 ///
 /// The quirky API has the benefit of avoiding
 /// the computation of the hash of the key twice,
 /// or copying the key as long as there is no insert.
 ///
 /// SharedArenaHashMap is like ArenaHashMap but gets the memory arena
 /// passed as an argument to the methods.
 /// So one MemoryArena can be shared with multiple SharedArenaHashMap.
 pub struct SharedArenaHashMap {
    // Bucket array. Its length is always a power of two.
    table: Vec<KeyValue>,
    // `table.len() - 1`, used to wrap a probe position into the table.
    mask: usize,
    // Number of occupied buckets.
    len: usize,
 }
 /// Cursor over the sequence of buckets visited when probing the table for a hash.
 struct LinearProbing {
    // Unmasked position. It is only wrapped into the table range
    // at the moment a bucket index is requested.
    pos: usize,
    // Bit mask applied to `pos` to obtain a bucket index.
    mask: usize,
 }
 impl LinearProbing {
    /// Starts a probe sequence for `hash` over a table described by `mask`.
    #[inline]
    fn compute(hash: HashType, mask: usize) -> LinearProbing {
        LinearProbing {
            pos: hash as usize,
            mask,
        }
    }
    /// Advances by one bucket and returns the index of the bucket to inspect.
    ///
    /// The position is incremented (with wrapping) before being masked, so the
    /// first bucket returned for a hash `h` is `(h + 1) & mask`, not `h & mask`.
    #[inline]
    fn next_probe(&mut self) -> usize {
        // Not saving the masked version removes a dependency.
        self.pos = self.pos.wrapping_add(1);
        self.pos & self.mask
    }
 }
 /// Iterator over copies of the buckets of the table that pass a
 /// `fn(&KeyValue) -> bool` predicate.
 type IterNonEmpty<'a> = Filter<Cloned<std::slice::Iter<'a, KeyValue>>, fn(&KeyValue) -> bool>;
 /// Iterator over the `(key, value address)` pairs of a `SharedArenaHashMap`.
 pub struct Iter<'a> {
    // Map being iterated over.
    hashmap: &'a SharedArenaHashMap,
    // Arena the keys are read from.
    memory_arena: &'a MemoryArena,
    // Underlying iterator over the buckets.
    inner: IterNonEmpty<'a>,
 }
 impl<'a> Iterator for Iter<'a> {
    type Item = (&'a [u8], Addr);
    /// Yields the key bytes of the next bucket together with the arena
    /// address of its value, or `None` once all buckets have been visited.
    fn next(&mut self) -> Option<Self::Item> {
        let bucket = self.inner.next()?;
        let entry: (&'a [u8], Addr) = self
            .hashmap
            .get_key_value(bucket.key_value_addr, self.memory_arena);
        Some(entry)
    }
 }
 /// Returns the greatest power of two lower or equal to `n`.
 ///
 /// # Panics
 ///
 /// Panics if `n == 0`.
 fn compute_previous_power_of_two(n: usize) -> usize {
    assert!(n > 0);
    // Index of the most significant set bit of `n`.
    let top_bit_index = usize::BITS - 1 - n.leading_zeros();
    1usize << top_bit_index
 }
 impl Default for SharedArenaHashMap {
    fn default() -> Self {
        SharedArenaHashMap::with_capacity(4)
    }
 }
 impl SharedArenaHashMap {
    /// Creates an empty map whose table holds `table_size` buckets, rounded
    /// down to a power of two, with a minimum of two buckets.
    ///
    /// The minimum matters for lookups: probing only stops on an empty bucket.
    /// With a single bucket, the table would be completely full after the first
    /// insertion and `get` would loop forever on any absent key.
    /// With at least two buckets, the resize performed by `mutate_or_create`
    /// keeps at least half of the table empty.
    ///
    /// # Panics
    ///
    /// Panics if `table_size` is 0.
    pub fn with_capacity(table_size: usize) -> SharedArenaHashMap {
        const MIN_NUM_BUCKETS: usize = 2;
        let num_buckets = compute_previous_power_of_two(table_size).max(MIN_NUM_BUCKETS);
        let table = vec![KeyValue::default(); num_buckets];
        SharedArenaHashMap {
            table,
            mask: num_buckets - 1,
            len: 0,
        }
    }
    /// Hashes `key` with `murmurhash2`.
    ///
    /// This variant is compiled when the `compare_hash_only` feature is disabled.
    #[inline]
    #[cfg(not(feature = "compare_hash_only"))]
    fn get_hash(&self, key: &[u8]) -> HashType {
        murmurhash32::murmurhash2(key)
    }
    /// Hashes `key` with a default-constructed `ahash::AHasher`.
    ///
    /// This variant is compiled when the `compare_hash_only` feature is enabled.
    #[inline]
    #[cfg(feature = "compare_hash_only")]
    fn get_hash(&self, key: &[u8]) -> HashType {
        // Since we compare only the hash we need a high quality hash.
        use std::hash::Hasher;
        let mut hasher = ahash::AHasher::default();
        hasher.write(key);
        hasher.finish() as HashType
    }
    /// Starts a probe sequence for `hash` over the current table.
    #[inline]
    fn probe(&self, hash: HashType) -> LinearProbing {
        LinearProbing::compute(hash, self.mask)
    }
    #[inline]
    pub fn mem_usage(&self) -> usize {
        self.table.len() * mem::size_of::<KeyValue>()
    }
    /// Returns `true` once at least half of the buckets are occupied.
    #[inline]
    fn is_saturated(&self) -> bool {
        let occupied_doubled = self.len * 2;
        occupied_doubled >= self.table.len()
    }
    /// Decodes the entry stored at `addr` in `memory_arena`.
    ///
    /// The entry is read as a 2-byte little-endian key length followed by the
    /// key bytes. Returns the key bytes and the address right after the key,
    /// which is where `mutate_or_create` stores the value.
    #[inline]
    fn get_key_value<'a>(&'a self, addr: Addr, memory_arena: &'a MemoryArena) -> (&[u8], Addr) {
        let data = memory_arena.slice_from(addr);
        // NOTE(review): the unchecked accesses below assume `addr` points to an entry
        // written by `mutate_or_create` into this same arena, and that `slice_from`
        // returns at least `2 + key length` bytes — confirm against `MemoryArena`.
        let key_bytes_len_bytes = unsafe { data.get_unchecked(..2) };
        let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap());
        let key_bytes: &[u8] = unsafe { data.get_unchecked(2..2 + key_bytes_len as usize) };
        (key_bytes, addr.offset(2 + key_bytes_len as u32))
    }
    /// Returns the address of the value of the entry stored at `addr`,
    /// provided the key of that entry is equal to `target_key`.
    ///
    /// Returns `None` if the keys differ.
    #[inline]
    #[cfg(not(feature = "compare_hash_only"))]
    fn get_value_addr_if_key_match(
        &self,
        target_key: &[u8],
        addr: Addr,
        memory_arena: &MemoryArena,
    ) -> Option<Addr> {
        use crate::fastcmp::fast_short_slice_compare;
        let (stored_key, value_addr) = self.get_key_value(addr, memory_arena);
        if !fast_short_slice_compare(stored_key, target_key) {
            return None;
        }
        Some(value_addr)
    }
    /// Returns the address of the value of the entry stored at `addr`.
    ///
    /// With the `compare_hash_only` feature, the stored key is never compared
    /// with `_target_key`: this always returns `Some`. The stored key length is
    /// only read in order to skip over the key.
    #[inline]
    #[cfg(feature = "compare_hash_only")]
    fn get_value_addr_if_key_match(
        &self,
        _target_key: &[u8],
        addr: Addr,
        memory_arena: &MemoryArena,
    ) -> Option<Addr> {
        // For the compare_hash_only feature, it would make sense to store the keys at a different
        // memory location. Here they will just pollute the cache.
        let data = memory_arena.slice_from(addr);
        let key_bytes_len_bytes = &data[..2];
        let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap());
        let value_addr = addr.offset(2 + key_bytes_len as u32);
        Some(value_addr)
    }
    /// Records a new entry in `bucket` and bumps the entry count.
    ///
    /// The bucket is overwritten unconditionally.
    #[inline]
    fn set_bucket(&mut self, hash: HashType, key_value_addr: Addr, bucket: usize) {
        let entry = KeyValue {
            key_value_addr,
            hash,
        };
        self.len += 1;
        self.table[bucket] = entry;
    }
    /// Returns `true` if the map contains no entry.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
    /// Returns the number of entries stored in the map.
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }
    /// Returns an iterator over the `(key, value address)` pairs of the map.
    ///
    /// Entries are yielded in bucket order, and keys are read from `memory_arena`.
    /// The value itself is not decoded: the yielded address is meant to be
    /// read from the arena by the caller.
    #[inline]
    pub fn iter<'a>(&'a self, memory_arena: &'a MemoryArena) -> Iter<'_> {
        Iter {
            inner: self
                .table
                .iter()
                .cloned()
                .filter(KeyValue::is_not_empty_ref),
            hashmap: self,
            memory_arena,
        }
    }
    /// Doubles the size of the table (with a minimum of 8 buckets) and
    /// re-inserts all of the occupied buckets.
    ///
    /// Buckets are placed using the hash they carry, so keys are not read
    /// back from the memory arena. `len` is unchanged.
    fn resize(&mut self) {
        let new_len = (self.table.len() * 2).max(1 << 3);
        let mask = new_len - 1;
        self.mask = mask;
        let new_table = vec![KeyValue::default(); new_len];
        let old_table = mem::replace(&mut self.table, new_table);
        for key_value in old_table.into_iter().filter(KeyValue::is_not_empty_ref) {
            let mut probe = LinearProbing::compute(key_value.hash, mask);
            // Probe until a free bucket is found in the new table.
            loop {
                let bucket = probe.next_probe();
                if self.table[bucket].is_empty() {
                    self.table[bucket] = key_value;
                    break;
                }
            }
        }
    }
    /// Gets the value associated with a key.
    ///
    /// Returns `None` if the key is not in the map. The key and value are read
    /// from `memory_arena`.
    ///
    /// Probing only stops on a matching entry or on an empty bucket, so this
    /// relies on the table never being completely full.
    /// NOTE(review): confirm this holds for very small capacities.
    #[inline]
    pub fn get<V>(&self, key: &[u8], memory_arena: &MemoryArena) -> Option<V>
    where V: Copy + 'static {
        let hash = self.get_hash(key);
        let mut probe = self.probe(hash);
        loop {
            let bucket = probe.next_probe();
            let kv: KeyValue = self.table[bucket];
            if kv.is_empty() {
                return None;
            } else if kv.hash == hash {
                // Same hash: check the key itself before trusting the bucket.
                if let Some(val_addr) =
                    self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
                {
                    let v = memory_arena.read(val_addr);
                    return Some(v);
                }
            }
        }
    }
    /// `mutate_or_create` creates a new entry for a given key if it does not exist
    /// or updates the existing entry.
    ///
    /// The actual logic for this update is defined in the `updater`
    /// argument.
    ///
    /// If the key is not present, `updater` will receive `None` and
    /// will be in charge of returning a default value.
    /// If the key already has an associated value, then it will be passed
    /// `Some(previous_value)`.
    ///
    /// Returns the value returned by `updater`, which is the value now stored
    /// for `key`.
    ///
    /// New entries are written to `memory_arena` as a 2-byte little-endian key
    /// length, the key bytes, then the value.
    /// NOTE(review): the key length goes through an `as u16` cast, so a key longer
    /// than `u16::MAX` bytes would have its stored length truncated — presumably
    /// callers enforce a smaller limit; confirm.
    #[inline]
    pub fn mutate_or_create<V>(
        &mut self,
        key: &[u8],
        memory_arena: &mut MemoryArena,
        mut updater: impl FnMut(Option<V>) -> V,
    ) -> V
    where
        V: Copy + 'static,
    {
        // Grow before probing, so that the insertion below always finds an empty bucket.
        if self.is_saturated() {
            self.resize();
        }
        let hash = self.get_hash(key);
        let mut probe = self.probe(hash);
        let mut bucket = probe.next_probe();
        let mut kv: KeyValue = self.table[bucket];
        loop {
            if kv.is_empty() {
                // The key does not exist yet.
                let val = updater(None);
                let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
                let key_addr = memory_arena.allocate_space(num_bytes);
                {
                    let data = memory_arena.slice_mut(key_addr, num_bytes);
                    let key_len_bytes: [u8; 2] = (key.len() as u16).to_le_bytes();
                    data[..2].copy_from_slice(&key_len_bytes);
                    let stop = 2 + key.len();
                    fast_short_slice_copy(key, &mut data[2..stop]);
                    store(&mut data[stop..], val);
                }
                self.set_bucket(hash, key_addr, bucket);
                return val;
            }
            if kv.hash == hash {
                if let Some(val_addr) =
                    self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
                {
                    // The key exists: overwrite its value in place.
                    let v = memory_arena.read(val_addr);
                    let new_v = updater(Some(v));
                    memory_arena.write_at(val_addr, new_v);
                    return new_v;
                }
            }
            // This allows fetching the next bucket before the loop jmp
            bucket = probe.next_probe();
            kv = self.table[bucket];
        }
    }
 }
 #[cfg(test)]
 mod tests {
    use std::collections::HashMap;
    use super::{compute_previous_power_of_two, SharedArenaHashMap};
    use crate::MemoryArena;
    /// Inserts two keys, updates one of them, and checks both the values
    /// returned by `mutate_or_create` and the values read back afterwards.
    #[test]
    fn test_hash_map() {
        let mut memory_arena = MemoryArena::default();
        let mut hash_map: SharedArenaHashMap = SharedArenaHashMap::default();
        let created_abc =
            hash_map.mutate_or_create(b"abc", &mut memory_arena, |opt_val: Option<u32>| {
                assert_eq!(opt_val, None);
                3u32
            });
        assert_eq!(created_abc, 3u32);
        let created_abcd =
            hash_map.mutate_or_create(b"abcd", &mut memory_arena, |opt_val: Option<u32>| {
                assert_eq!(opt_val, None);
                4u32
            });
        assert_eq!(created_abcd, 4u32);
        let updated_abc =
            hash_map.mutate_or_create(b"abc", &mut memory_arena, |opt_val: Option<u32>| {
                assert_eq!(opt_val, Some(3u32));
                5u32
            });
        assert_eq!(updated_abc, 5u32);
        // Updating an existing key must not create a second entry.
        assert_eq!(hash_map.len(), 2);
        assert!(!hash_map.is_empty());
        let mut vanilla_hash_map = HashMap::new();
        let iter_values = hash_map.iter(&memory_arena);
        for (key, addr) in iter_values {
            let val: u32 = memory_arena.read(addr);
            vanilla_hash_map.insert(key.to_owned(), val);
        }
        assert_eq!(vanilla_hash_map.len(), 2);
        // The iterator must expose the latest value of each key.
        assert_eq!(vanilla_hash_map.get(&b"abc"[..]), Some(&5u32));
        assert_eq!(vanilla_hash_map.get(&b"abcd"[..]), Some(&4u32));
        // Direct lookups must agree with the iterator.
        assert_eq!(hash_map.get::<u32>(b"abc", &memory_arena), Some(5u32));
        assert_eq!(hash_map.get::<u32>(b"abcd", &memory_arena), Some(4u32));
        assert_eq!(hash_map.get::<u32>(b"ab", &memory_arena), None);
    }
    /// A freshly created map is empty and does not find any key.
    #[test]
    fn test_empty_hashmap() {
        let memory_arena = MemoryArena::default();
        let hash_map: SharedArenaHashMap = SharedArenaHashMap::default();
        assert!(hash_map.is_empty());
        assert_eq!(hash_map.len(), 0);
        assert_eq!(hash_map.get::<u32>(b"abc", &memory_arena), None);
    }
    #[test]
    fn test_compute_previous_power_of_two() {
        assert_eq!(compute_previous_power_of_two(8), 8);
        assert_eq!(compute_previous_power_of_two(9), 8);
        assert_eq!(compute_previous_power_of_two(7), 4);
        // Expressed with `usize::BITS` so that the test also holds on non 64-bit targets.
        assert_eq!(
            compute_previous_power_of_two(usize::MAX),
            1usize << (usize::BITS - 1)
        );
    }
    /// Inserts enough terms to trigger several resizes, and checks that
    /// exactly the inserted terms come back out of the iterator.
    #[test]
    fn test_many_terms() {
        let mut memory_arena = MemoryArena::default();
        let mut terms: Vec<String> = (0..20_000).map(|val| val.to_string()).collect();
        let mut hash_map: SharedArenaHashMap = SharedArenaHashMap::default();
        for term in terms.iter() {
            hash_map.mutate_or_create(
                term.as_bytes(),
                &mut memory_arena,
                |_opt_val: Option<u32>| 5u32,
            );
        }
        assert_eq!(hash_map.len(), terms.len());
        let mut terms_back: Vec<String> = hash_map
            .iter(&memory_arena)
            .map(|(bytes, _)| String::from_utf8(bytes.to_vec()).unwrap())
            .collect();
        terms_back.sort();
        terms.sort();
        // Comparing the whole vectors also catches missing or extra entries.
        assert_eq!(terms, terms_back);
    }
 }
Author	SHA1	Message	Date
Adam Reichold	d1177fe22f	Opportunistically seed forked block caches from current one.	2023-12-11 15:24:03 +01:00
Adam Reichold	0361a1edaa	Do not expose StoreReader::fork_cache and ::cache_key in the public API if only Searcher::docs_async uses them.	2023-12-11 15:24:03 +01:00
Adam Reichold	3d48ce80c5	Add Searcher::docs_async which efficently fetches multiple documents by group them by segment and block.	2023-12-11 15:24:03 +01:00
Adam Reichold	49a913f6f8	Expose which documents cache together to user code.	2023-12-11 11:01:17 +01:00
Adam Reichold	19a773da47	Allow cheaply cloning a StoreReader to enable user control over block cache usage.	2023-12-11 10:28:50 +01:00
PSeitz	bff7c58497	improve indexing benchmark (#2275 )	2023-12-11 09:04:42 +01:00
trinity-1686a	9ebc5ed053	use fst for sstable index (#2268 ) * read path for new fst based index * implement BlockAddrStoreWriter * extract slop/derivation computation * use better linear approximator and allow negative correction to approximator * document format and reorder some fields * optimize single block sstable size * plug backward compat	2023-12-04 15:13:15 +01:00
PSeitz	0b56c88e69	Revert "Preparing for 0.21.2 release." (#2258 ) * Revert "Preparing for 0.21.2 release. (#2256)" This reverts commit `9caab45136`. * bump version to 0.21.1 * set version to 0.22.0-dev	2023-12-01 13:46:12 +01:00
PSeitz	24841f0b2a	update bitpacker dep (#2269 )	2023-12-01 13:45:52 +01:00
PSeitz	1a9fc10be9	add fields_metadata to SegmentReader, add columnar docs (#2222 ) * add fields_metadata to SegmentReader, add columnar docs * use schema to resolve field, add test * normalize paths * merge for FieldsMetadata, add fields_metadata on Index * Update src/core/segment_reader.rs Co-authored-by: Paul Masurel <paul@quickwit.io> * merge code paths * add Hash * move function oustide --------- Co-authored-by: Paul Masurel <paul@quickwit.io>	2023-11-22 12:29:53 +01:00
PSeitz	07573a7f19	update fst (#2267 ) update fst to 0.5 (deduplicates regex-syntax in the dep tree) deps cleanup	2023-11-21 16:06:57 +01:00
BlackHoleFox	daad2dc151	Take string references instead of owned values building Facet paths (#2265 )	2023-11-20 09:40:44 +01:00
PSeitz	054f49dc31	support escaped dot, add agg test (#2250 ) add agg test for nested JSON allow escaping of dot	2023-11-20 03:00:57 +01:00
PSeitz	47009ed2d3	remove unused deps (#2264 ) found with cargo machete remove pprof (doesn't work)	2023-11-20 02:59:59 +01:00
PSeitz	0aae31d7d7	reduce number of allocations (#2257 ) * reduce number of allocations Explanation makes up around 50% of all allocations (numbers not perf). It's created during serialization but not called. - Make Explanation optional in BM25 - Avoid allocations when using Explanation * use Cow	2023-11-16 13:47:36 +01:00
Paul Masurel	9caab45136	Preparing for 0.21.2 release. (#2256 )	2023-11-15 10:43:36 +09:00
Chris Tam	6d9a7b7eb0	Derive Debug for SchemaBuilder (#2254 )	2023-11-15 01:03:44 +01:00
dependabot[bot]	7a2c5804b1	Update itertools requirement from 0.11.0 to 0.12.0 (#2255 ) Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.11.0...v0.12.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2023-11-15 01:03:08 +01:00
François Massot	5319977171	Merge pull request #2253 from quickwit-oss/issue/2251-bug-merge-json-object-with-number Fix bug occuring when merging JSON object indexed with positions.	2023-11-14 17:28:29 +01:00
trinity-1686a	828632e8c4	rustfmt	2023-11-14 15:05:16 +01:00
Paul Masurel	6b59ec6fd5	Fix bug occuring when merging JSON object indexed with positions. In JSON Object field the presence of term frequencies depend on the field. Typically, a string with postiions indexed will have positions while numbers won't. The presence or absence of term freqs for a given term is unfortunately encoded in a very passive way. It is given by the presence of extra information in the skip info, or the lack of term freqs after decoding vint blocks. Before, after writing a segment, we would encode the segment correctly (without any term freq for number in json object field). However during merge, we would get the default term freq=1 value. (this is default in the absence of encoded term freqs) The merger would then proceed and attempt to decode 1 position when there are in fact none. This PR requires to explictly tell the posting serialize whether term frequencies should be serialized for each new term. Closes #2251	2023-11-14 22:41:48 +09:00
PSeitz	b60d862150	docid deltas while indexing (#2249 ) * docid deltas while indexing storing deltas is especially helpful for repetitive data like logs. In those cases, recording a doc on a term costed 4 bytes instead of 1 byte now. HDFS Indexing 1.1GB Total memory consumption: Before: 760 MB Now: 590 MB * use scan for delta decoding	2023-11-13 05:14:27 +01:00
PSeitz	4837c7811a	add missing inlines (#2245 )	2023-11-10 08:00:42 +01:00
PSeitz	5a2397d57e	add sstable ord_to_term benchmark (#2242 )	2023-11-10 07:27:48 +01:00
PSeitz	927b4432c9	Perf: use term hashmap in fastfield (#2243 ) * add shared arena hashmap * bench fastfield indexing * use shared arena hashmap in columnar lower minimum resize in hashtable * clippy * add comments	2023-11-09 13:44:02 +01:00