Compare commits

3 Commits

Author | SHA1 | Message | Date
Paul Masurel | 1553901f51 | Introducing a column trait | 2022-08-27 22:48:05 +02:00
Paul Masurel | e8a6e123ae | Small refactoring estimate. | 2022-08-27 21:53:46 +02:00
Paul Masurel | 43a4c8287c | Removing Deserializer trait (and renaming the `Serializer` trait `FastFieldCodec`) | 2022-08-27 21:11:54 +02:00
21 changed files with 206 additions and 304 deletions

View File

@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
pub struct AntiCallToken(());
/// Trait used to indicate when no more writes need to be done on a writer
pub trait TerminatingWrite: Write + Send + Sync {
pub trait TerminatingWrite: Write + Send {
/// Indicates that the writer will no longer be used. Internally calls `terminate_ref`.
fn terminate(mut self) -> io::Result<()>
where Self: Sized {
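For context, `terminate` consumes the writer once all writes are done, and implementors supply only the internal `terminate_ref`. A minimal sketch of a conforming implementation follows; the `terminate_ref(&mut self, AntiCallToken) -> io::Result<()>` signature and the `CountingSink` type are assumptions inferred from the doc comments above, not part of this diff.

use std::io::{self, Write};

struct CountingSink(usize);

impl Write for CountingSink {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.0 += buf.len(); // discard the bytes, only count them
        Ok(buf.len())
    }
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

// Hypothetical impl: flush on termination so no buffered state is lost.
// AntiCallToken cannot be constructed outside its module, which is what
// forces callers to go through terminate() rather than terminate_ref().
impl TerminatingWrite for CountingSink {
    fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
        self.flush()
    }
}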

View File

@@ -289,7 +289,6 @@ impl FastFieldCodec for BlockwiseLinearCodec {
/// estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation from the calculated value are,
/// and the offset is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
return None;

View File

@@ -16,6 +16,7 @@ mod column;
pub use self::column::Column;
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
pub enum FastFieldCodecType {
@@ -90,41 +91,52 @@ pub struct FastFieldStats {
pub num_vals: u64,
}
struct VecColum<'a>(&'a [u64]);
impl<'a> Column for VecColum<'a> {
impl<'a> Column for &'a [u64] {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.0.iter().min().cloned().unwrap_or(0)
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.0.iter().max().cloned().unwrap_or(0)
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
self.len() as u64
}
}
impl<'a> From<&'a [u64]> for VecColum<'a> {
fn from(data: &'a [u64]) -> Self {
Self(data)
impl Column for Vec<u64> {
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new((self as &[u64]).iter().cloned())
}
fn min_value(&self) -> u64 {
self.iter().min().unwrap_or(0)
}
fn max_value(&self) -> u64 {
self.iter().max().unwrap_or(0)
}
fn num_vals(&self) -> u64 {
self.len() as u64
}
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use proptest::arbitrary::any;
use proptest::proptest;
use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
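The `Column` trait definition itself lives in the new `column` module and is not shown in this compare; the sketch below is inferred from the two blanket impls above (the real definition may differ in bounds or default methods).

pub trait Column {
    /// Returns the value stored at `position`.
    fn get_val(&self, position: u64) -> u64;
    /// Iterates over all values in position order.
    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b>;
    fn min_value(&self) -> u64;
    fn max_value(&self) -> u64;
    fn num_vals(&self) -> u64;
}

Because `&[u64]` and `Vec<u64>` now implement `Column` directly, test code can hand plain slices to `estimate` and `serialize`, which is what makes the `VecColum` wrapper above redundant.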
@@ -134,10 +146,10 @@ mod tests {
data: &[u64],
name: &str,
) -> Option<(f32, f32)> {
let estimation = Codec::estimate(&VecColum::from(data))?;
let estimation = Codec::estimate(&data)?;
let mut out: Vec<u8> = Vec::new();
Codec::serialize(&mut out, &VecColum::from(data)).unwrap();
Codec::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
@@ -155,31 +167,20 @@ mod tests {
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
#[test]
fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
fn num_strategy() -> impl Strategy<Value = u64> {
prop_oneof![
1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
20 => prop::num::u64::ANY,
]
}
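For readers unfamiliar with proptest's `prop_oneof!`: the removed `num_strategy` weights (1/1/20) meant about one value in eleven was forced to within 10 of either end of the `u64` range, deliberately stressing overflow paths, while `any::<u64>()` draws uniformly. A hedged standalone equivalent, with the hypothetical name `biased_u64`:

use proptest::prelude::*;

// Sketch of a biased u64 strategy in the style of the removed num_strategy.
fn biased_u64() -> impl Strategy<Value = u64> {
    prop_oneof![
        1 => any::<u64>().prop_map(|n| u64::MAX - (n % 10)), // near the top
        1 => any::<u64>().prop_map(|n| n % 10),              // near zero
        20 => any::<u64>(),                                  // uniform bulk
    ]
}

proptest! {
    #[test]
    fn roundtrip_u64(v in biased_u64()) {
        prop_assert_eq!(v, u64::from_le_bytes(v.to_le_bytes()));
    }
}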
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
@@ -195,11 +196,6 @@ mod tests {
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));
data_and_names.push((
vec![1572656989877777, 1170935903116329, 720575940379279, 0],
"overflow error",
));
data_and_names
}
@@ -234,7 +230,6 @@ mod tests {
#[test]
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let data: VecColum = data.as_slice().into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.01);
@@ -248,9 +243,8 @@ mod tests {
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
let data = vec![200, 10, 10, 10, 10, 1000, 20];
let data: VecColum = data.into();
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
assert_le!(linear_interpol_estimation, 0.32);
@@ -261,7 +255,6 @@ mod tests {
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
let mut data: Vec<u64> = (200..=20000_u64).collect();
data.push(1_000_000);
let data: VecColum = data.as_slice().into();
// in this case the linear interpolation can't, in fact, be worse than bitpacking,
// but the estimator adds some threshold, which leads to a worse estimated behavior

View File

@@ -115,9 +115,9 @@ fn diff(val1: u64, val2: u64) -> f64 {
#[inline]
pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
if slope < 0.0 {
first_val.saturating_sub((pos as f32 * -slope) as u64)
first_val - (pos as f32 * -slope) as u64
} else {
first_val.saturating_add((pos as f32 * slope) as u64)
first_val + (pos as f32 * slope) as u64
}
}
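The behavioral difference between the two variants surfaces only on extreme inputs: the saturating calls clamp at the `u64` bounds, while plain `+`/`-` panics on overflow in debug builds and wraps in release. A standalone illustration:

fn main() {
    let first_val: u64 = 10;
    let delta: u64 = 20; // e.g. (pos as f32 * -slope) as u64
    assert_eq!(first_val.saturating_sub(delta), 0); // clamps instead of underflowing
    assert_eq!(first_val.checked_sub(delta), None); // the same overflow, made explicit
    // `first_val - delta` would panic here in a debug build:
    // "attempt to subtract with overflow"
}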
@@ -193,7 +193,6 @@ impl FastFieldCodec for LinearCodec {
/// estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation from the calculated value are, and
/// the offset to shift all values to >=0 is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 3 {
return None; // disable compressor for this case
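To make the difficulty described in the doc comment concrete, here is a simplified sketch of the estimation idea under stated assumptions (fit a line through the first and last values, size the bit width by the worst sampled deviation); `estimate_linear` is a hypothetical standalone function, not the codec's actual code:

/// Rough compression-ratio estimate for a linear model.
fn estimate_linear(data: &[u64]) -> Option<f32> {
    if data.len() < 3 {
        return None; // disable the codec, as in the hunk above
    }
    let first = data[0] as f64;
    let last = *data.last()? as f64;
    let slope = (last - first) / (data.len() as f64 - 1.0);
    // Worst absolute deviation from the interpolated line over a sample;
    // sampling is why local maxima can be missed, as the doc comment warns.
    let max_deviation = data
        .iter()
        .enumerate()
        .step_by((data.len() / 100).max(1)) // roughly 100 sampled positions
        .map(|(pos, &val)| ((first + slope * pos as f64) - val as f64).abs())
        .fold(0.0_f64, f64::max);
    // one sign bit plus enough bits to encode the worst deviation
    let bits_per_val = max_deviation.max(1.0).log2().ceil() + 1.0;
    Some((bits_per_val / 64.0) as f32)
}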
@@ -259,8 +258,6 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
#[cfg(test)]
mod tests {
use rand::RngCore;
use super::*;
use crate::tests::get_codec_test_datasets;
@@ -317,13 +314,6 @@ mod tests {
create_and_validate(&data, "large amplitude");
}
#[test]
fn overflow_error_test() {
let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
create_and_validate(&data, "overflow test");
}
#[test]
fn linear_interpol_fast_concave_data() {
let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
@@ -343,9 +333,10 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
let mut rng = rand::thread_rng();
for _ in 0..50 {
let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
for _ in 0..5000 {
let mut data = (0..10_000)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
create_and_validate(&data, "random");

View File

@@ -3,33 +3,9 @@ extern crate prettytable;
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
use prettytable::{Cell, Row, Table};
struct Data<'a>(&'a [u64]);
impl<'a> Column for Data<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
*self.0.iter().min().unwrap_or(&0)
}
fn max_value(&self) -> u64 {
*self.0.iter().max().unwrap_or(&0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
fn main() {
let mut table = Table::new();
@@ -110,11 +86,10 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
pub fn serialize_with_codec<C: FastFieldCodec>(
data: &[u64],
) -> Option<(f32, f32, FastFieldCodecType)> {
let data = Data(data);
let estimation = C::estimate(&data)?;
let mut out = Vec::new();
C::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
Some((estimation, actual_compression, C::CODEC_TYPE))
}
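Since `&[u64]` now implements `Column`, the helper can be fed any slice; a hypothetical call site (`print_estimate` is illustrative, the imports are those at the top of this file):

fn print_estimate(data: &[u64]) {
    if let Some((estimate, actual, codec_type)) = serialize_with_codec::<LinearCodec>(data) {
        println!("{codec_type:?}: estimated ratio {estimate:.3}, actual ratio {actual:.3}");
    } else {
        println!("LinearCodec declined to compress this data");
    }
}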

View File

@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
use stable_deref_trait::StableDeref;
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a slice.
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
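`StableDeref` is what makes the static exposure sound: the backing object's heap data does not move when the owner itself is moved, so a raw slice into it stays valid as long as the owner is kept alive. Below is a minimal sketch of the idea under that assumption; `BytesSketch` is hypothetical and is not the actual OwnedBytes implementation.

use std::ops::Deref;
use stable_deref_trait::StableDeref;

struct BytesSketch {
    data: &'static [u8], // really borrows from `_owner`, never truly 'static
    _owner: Box<dyn Deref<Target = [u8]> + Send + Sync>,
}

impl BytesSketch {
    fn new<T>(owner: T) -> BytesSketch
    where
        T: StableDeref + Deref<Target = [u8]> + Send + Sync + 'static,
    {
        // Sound only because T: StableDeref guarantees that moving `owner`
        // into the box does not move the bytes it dereferences to.
        let data = unsafe { std::mem::transmute::<&[u8], &'static [u8]>(&owner) };
        BytesSketch { data, _owner: Box::new(owner) }
    }

    fn as_slice(&self) -> &[u8] {
        self.data
    }
}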

View File

@@ -409,7 +409,7 @@ impl TopDocs {
/// # use tantivy::query::QueryParser;
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::fastfield::Column;
/// use tantivy::fastfield::FastFieldReader;
/// use tantivy::schema::Field;
///
/// fn create_schema() -> Schema {
@@ -458,7 +458,7 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId, original_score: Score| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let popularity: u64 = popularity_reader.get(doc);
/// // Well.. For the sake of the example we use a simple logarithm
/// // function.
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
@@ -517,7 +517,7 @@ impl TopDocs {
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::Field;
/// use fastfield_codecs::Column;
/// use tantivy::fastfield::FastFieldReader;
///
/// # fn create_schema() -> Schema {
/// # let mut schema_builder = Schema::builder();
@@ -569,8 +569,8 @@ impl TopDocs {
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
/// let boosted: u64 = boosted_reader.get_val(doc as u64);
/// let popularity: u64 = popularity_reader.get(doc);
/// let boosted: u64 = boosted_reader.get(doc);
/// Scores do not have to be `f64` in tantivy.
/// // Here we return a couple to get lexicographical order
/// // for free.

View File

@@ -7,7 +7,6 @@ use std::sync::Arc;
use super::segment::Segment;
use super::IndexSettings;
use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
use crate::core::{
Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
};
@@ -17,7 +16,7 @@ use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::segment_updater::save_new_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -48,34 +47,6 @@ fn load_metas(
.map_err(From::from)
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// IndexBuilder can be used to create an index.
///
/// Use in conjunction with `SchemaBuilder`. Global index settings
@@ -164,25 +135,6 @@ impl IndexBuilder {
self.create(mmap_directory)
}
/// Dragons ahead!!!
///
/// The point of this API is to let users create a simple index with a single segment
/// and without starting any thread.
///
/// Do not use this method if you are not sure what you are doing.
///
/// It expects an originally empty directory, and will not run any GC operation.
#[doc(hidden)]
pub fn single_segment_index_writer(
self,
dir: impl Into<Box<dyn Directory>>,
mem_budget: usize,
) -> crate::Result<SingleSegmentIndexWriter> {
let index = self.create(dir)?;
let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
Ok(index_simple_writer)
}
/// Creates a new index in a temp directory.
///
/// The index will use the `MMapDirectory` in a newly created directory.
@@ -628,12 +580,10 @@ impl fmt::Debug for Index {
#[cfg(test)]
mod tests {
use crate::collector::Count;
use crate::directory::{RamDirectory, WatchCallback};
use crate::query::TermQuery;
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
use crate::schema::{Field, Schema, INDEXED, TEXT};
use crate::tokenizer::TokenizerManager;
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
#[test]
fn test_indexer_for_field() {
@@ -899,28 +849,4 @@ mod tests {
);
Ok(())
}
#[test]
fn test_single_segment_index_writer() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let directory = RamDirectory::default();
let mut single_segment_index_writer = Index::builder()
.schema(schema)
.single_segment_index_writer(directory, 10_000_000)?;
for _ in 0..10 {
let doc = doc!(text_field=>"hello");
single_segment_index_writer.add_document(doc)?;
}
let index = single_segment_index_writer.finalize()?;
let searcher = index.reader()?.searcher();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
);
let count = searcher.search(&term_query, &Count)?;
assert_eq!(count, 10);
Ok(())
}
}

View File

@@ -7,7 +7,6 @@ mod segment;
mod segment_component;
mod segment_id;
mod segment_reader;
mod single_segment_index_writer;
use std::path::Path;
@@ -24,7 +23,6 @@ pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::single_segment_index_writer::SingleSegmentIndexWriter;
/// The meta file contains all the information about the list of segments and the schema
/// of the index.

View File

@@ -1,47 +0,0 @@
use crate::indexer::operation::AddOperation;
use crate::indexer::segment_updater::save_metas;
use crate::indexer::SegmentWriter;
use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
#[doc(hidden)]
pub struct SingleSegmentIndexWriter {
segment_writer: SegmentWriter,
segment: Segment,
opstamp: Opstamp,
}
impl SingleSegmentIndexWriter {
pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
let segment = index.new_segment();
let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
Ok(Self {
segment_writer,
segment,
opstamp: 0,
})
}
pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
let opstamp = self.opstamp;
self.opstamp += 1;
self.segment_writer
.add_document(AddOperation { opstamp, document })
}
pub fn finalize(self) -> crate::Result<Index> {
let max_doc = self.segment_writer.max_doc();
self.segment_writer.finalize()?;
let segment: Segment = self.segment.with_max_doc(max_doc);
let index = segment.index();
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: vec![segment.meta().clone()],
schema: index.schema(),
opstamp: 0,
payload: None,
};
save_metas(&index_meta, index.directory())?;
index.directory().sync_directory()?;
Ok(segment.index().clone())
}
}

View File

@@ -210,7 +210,7 @@ mod tests {
#[test]
fn test_fastfield_gcd_i64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_i64_with_codec(code_type, 5500)?;
test_fastfield_gcd_i64_with_codec(code_type, 5005)?;
}
Ok(())
}
@@ -251,7 +251,7 @@ mod tests {
#[test]
fn test_fastfield_gcd_u64() -> crate::Result<()> {
for &code_type in ALL_CODECS {
test_fastfield_gcd_u64_with_codec(code_type, 5500)?;
test_fastfield_gcd_u64_with_codec(code_type, 5005)?;
}
Ok(())
}
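Background for these tests: during serialization the fast-field writer looks for a divisor shared by all values and stores `(val - min) / gcd`, which shrinks the per-value bit width (second-granularity timestamps stored in milliseconds share a GCD of 1000, for instance). A standalone sketch of that normalization, assuming this shape rather than quoting tantivy's actual code:

/// Computes the greatest common divisor (Euclid's algorithm).
fn gcd(mut a: u64, mut b: u64) -> u64 {
    while b != 0 {
        let t = b;
        b = a % b;
        a = t;
    }
    a
}

/// Returns (min, gcd, normalized values) so that
/// original = min + normalized * gcd for every entry.
fn normalize(vals: &[u64]) -> Option<(u64, u64, Vec<u64>)> {
    let min = *vals.iter().min()?;
    let g = vals.iter().fold(0u64, |acc, &v| gcd(acc, v - min)).max(1);
    Some((min, g, vals.iter().map(|&v| (v - min) / g).collect()))
}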

View File

@@ -978,20 +978,37 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use fastfield_codecs::Column;
use std::collections::HashMap;
use std::path::Path;
use test::{self, Bencher};
use super::tests::generate_permutation;
use super::tests::{generate_permutation, FIELD, SCHEMA};
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::fastfield::tests::generate_permutation_gcd;
use crate::fastfield::FastFieldReader;
#[bench]
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for _ in 0..n {
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
@@ -999,83 +1016,102 @@ mod bench {
}
#[bench]
fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for _ in 0..n {
a = column.get_val(a as u64);
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n / 7).map(|val| val * 7) {
a ^= fast_field_reader.get(i);
}
a
});
}
}
#[bench]
fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
fn bench_intfastfield_fflookup(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation();
let n = permutation.len();
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += permutation[i as usize];
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
}
#[bench]
fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in (0..n / 7).map(|val| val * 7) {
a += column.get_val(i as u64);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
let permutation = generate_permutation();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0u64..n as u64 {
a += column.get_val(i);
}
a
});
}
#[bench]
fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
let path = Path::new("test");
let permutation = generate_permutation_gcd();
let n = permutation.len();
let column = DynamicFastFieldReader::from(permutation);
b.iter(|| {
let mut a = 0u64;
for i in 0..n as u64 {
a += column.get_val(i);
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
a
});
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
#[bench]
fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let mut a = 0u64;
for i in 0..permutation.len() {
a += permutation[i as usize] as u64;
}
a
});
b.iter(|| {
let mut a = 0u32;
for i in 0u32..permutation.len() as u32 {
a = fast_field_reader.get(i) as u32;
}
a
});
}
}
}
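A note on the recurring `test::black_box(7000u32)` in these benchmarks: it hides the loop bound from the optimizer so the lookup loop cannot be constant-folded away. A minimal standalone sketch of the pattern (nightly-only `test` feature, as in the code above; `bench_sum_stride7` is a made-up name):

#![feature(test)]
extern crate test;

#[cfg(test)]
mod bench {
    use test::{black_box, Bencher};

    #[bench]
    fn bench_sum_stride7(b: &mut Bencher) {
        let data: Vec<u64> = (0..10_000u64).collect();
        b.iter(|| {
            // black_box keeps `n` opaque, so the loop survives optimization
            let n = black_box(7000u32);
            let mut acc = 0u64;
            for i in (0u32..n / 7).map(|v| v * 7) {
                acc ^= data[i as usize];
            }
            acc
        });
    }
}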

View File

@@ -174,7 +174,9 @@ fn index_documents(
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<()> {
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
let schema = segment.schema();
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
for document_group in grouped_document_iterator {
for doc in document_group {
segment_writer.add_document(doc)?;

View File

@@ -133,7 +133,7 @@ impl TermOrdinalMapping {
fn max_term_ord(&self) -> TermOrdinal {
self.per_segment_new_term_ordinals
.iter()
.flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
.flat_map(|term_ordinals| term_ordinals.iter().max())
.max()
.unwrap_or_default()
}
@@ -784,7 +784,7 @@ impl IndexMerger {
let new_doc_id: DocId =
self.offsets
.iter()
.position(|&offset| offset > pos)
.position(|offset| offset > pos)
.expect("pos is out of bounds") as DocId
- 1u32;

View File

@@ -480,11 +480,11 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge {
use fastfield_codecs::Column;
use test::{self, Bencher};
use crate::core::Index;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::indexer::merger::IndexMerger;
use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
@@ -546,7 +546,7 @@ mod bench_sorted_index_merge {
// add values in order of the new doc_ids
let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get_val(doc_id as u64);
val = field_reader.get(doc_id);
}
val

View File

@@ -25,10 +25,39 @@ use crate::indexer::{
DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
SegmentSerializer,
};
use crate::schema::Schema;
use crate::{FutureResult, Opstamp};
const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(
schema: Schema,
index_settings: IndexSettings,
directory: &dyn Directory,
) -> crate::Result<()> {
save_metas(
&IndexMeta {
index_settings,
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)?;
directory.sync_directory()?;
Ok(())
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
@@ -38,7 +67,7 @@ const NUM_MERGE_THREADS: usize = 4;
/// and flushed.
///
/// This method is not part of tantivy's public API
pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
info!("save metas");
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
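The atomicity promised by the doc comment comes from publishing the serialized buffer in one step; tantivy routes this through `Directory::atomic_write`. The classic file-system pattern behind such an operation looks like the generic sketch below (illustrative only, not tantivy's code):

use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

/// Write `data` to `path` atomically: write a temp file, flush, then rename.
/// Readers see either the old meta.json or the complete new one, never a torn write.
fn atomic_write(path: &Path, data: &[u8]) -> std::io::Result<()> {
    let tmp = path.with_extension("tmp");
    let mut file = File::create(&tmp)?;
    file.write_all(data)?;
    file.sync_all()?; // ensure bytes hit disk before the rename publishes them
    fs::rename(&tmp, path) // rename is atomic on POSIX file systems
}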

View File

@@ -80,8 +80,8 @@ impl SegmentWriter {
pub fn for_segment(
memory_budget_in_bytes: usize,
segment: Segment,
schema: Schema,
) -> crate::Result<SegmentWriter> {
let schema = segment.schema();
let tokenizer_manager = segment.index().tokenizers().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;

View File

@@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED};
pub use crate::core::{
Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
SegmentReader, SingleSegmentIndexWriter,
SegmentReader,
};
pub use crate::directory::Directory;
pub use crate::indexer::demuxer::*;

View File

@@ -227,7 +227,7 @@ pub mod tests {
{
let mut segment_writer =
SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
{
// checking that position works if the field has two values
let op = AddOperation {

View File

@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
/// and building a `Segment` in anonymous memory.
///
/// `PostingsWriter` writes in a `MemoryArena`.
pub(crate) trait PostingsWriter: Send + Sync {
pub(crate) trait PostingsWriter {
/// Record that a document contains a term at a given position.
///
/// * doc - the document id

View File

@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
/// * the document id
/// * the term frequency
/// * the term positions
pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
pub(crate) trait Recorder: Copy + Default + 'static {
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document