Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-10 19:12:54 +00:00)

Compare commits: col-trait-...column-tra (11 commits)
Commits in this comparison:

- 4a072e3c18
- 84e0c75598
- 08c4412d73
- 70e58adff9
- 0d1cd119e9
- d3dd620048
- e89c220b56
- a451f6d60d
- f740ddeee3
- 7a26cc9022
- 54972caa7c
@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
 pub struct AntiCallToken(());

 /// Trait used to indicate when no more writes need to be done on a writer
-pub trait TerminatingWrite: Write + Send {
+pub trait TerminatingWrite: Write + Send + Sync {
     /// Indicate that the writer will no longer be used. Internally calls terminate_ref.
     fn terminate(mut self) -> io::Result<()>
     where Self: Sized {
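The `+ Sync` bound is the only change in this hunk. A minimal sketch of what it buys (ours, not part of the diff): a writer can now be shared across indexing threads, for example behind an `Arc`:

```rust
use std::sync::Arc;
use std::thread;

// `thread::spawn` needs its closure to be `Send`; capturing an `Arc<W>`
// makes that require `W: Send + Sync` -- exactly the combination that the
// tightened `TerminatingWrite: Write + Send + Sync` bound now guarantees.
fn share_writer<W: Send + Sync + 'static>(writer: Arc<W>) {
    let clone = Arc::clone(&writer);
    thread::spawn(move || drop(clone)).join().unwrap();
}
```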
@@ -289,6 +289,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
 /// Estimation for linear interpolation is hard because you don't know
 /// where the local maxima are for the deviation of the calculated value and
 /// the offset is also unknown.
+#[allow(clippy::question_mark)]
 fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
     if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
         return None;
@@ -90,52 +90,41 @@ pub struct FastFieldStats {
     pub num_vals: u64,
 }

-impl<'a> Column for &'a [u64] {
+struct VecColum<'a>(&'a [u64]);
+
+impl<'a> Column for VecColum<'a> {
     fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
+        self.0[position as usize]
     }

     fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
+        Box::new(self.0.iter().cloned())
     }

     fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
+        self.0.iter().min().cloned().unwrap_or(0)
     }

     fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
+        self.0.iter().max().cloned().unwrap_or(0)
     }

     fn num_vals(&self) -> u64 {
-        self.len() as u64
+        self.0.len() as u64
     }
 }

-impl Column for Vec<u64> {
-    fn get_val(&self, position: u64) -> u64 {
-        self[position as usize]
-    }
-    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
-        Box::new((self as &[u64]).iter().cloned())
-    }
-    fn min_value(&self) -> u64 {
-        self.iter().min().unwrap_or(0)
-    }
-
-    fn max_value(&self) -> u64 {
-        self.iter().max().unwrap_or(0)
-    }
-
-    fn num_vals(&self) -> u64 {
-        self.len() as u64
-    }
-}
+impl<'a> From<&'a [u64]> for VecColum<'a> {
+    fn from(data: &'a [u64]) -> Self {
+        Self(data)
+    }
+}

 #[cfg(test)]
 mod tests {
     use proptest::arbitrary::any;
-    use proptest::proptest;
+    use proptest::prelude::*;
+    use proptest::strategy::Strategy;
+    use proptest::{prop_oneof, proptest};

     use crate::bitpacked::BitpackedCodec;
     use crate::blockwise_linear::BlockwiseLinearCodec;
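Our reading of this hunk: implementing `Column` directly on `&[u64]` and `Vec<u64>` made those impls part of the crate's public surface, while the private `VecColum` newtype (spelling kept from the diff) plus a `From<&'a [u64]>` conversion confines the slice-backed implementation to test code. The test hunks below show the call sites switching to `VecColum::from(data)`.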
@@ -145,10 +134,10 @@ mod tests {
         data: &[u64],
         name: &str,
     ) -> Option<(f32, f32)> {
-        let estimation = Codec::estimate(&data)?;
+        let estimation = Codec::estimate(&VecColum::from(data))?;

         let mut out: Vec<u8> = Vec::new();
-        Codec::serialize(&mut out, &data).unwrap();
+        Codec::serialize(&mut out, &VecColum::from(data)).unwrap();

         let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
@@ -166,21 +155,32 @@ mod tests {
     }

     proptest! {
+        #![proptest_config(ProptestConfig::with_cases(100))]
         #[test]
-        fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
+        fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
             create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
             create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
             create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
         }
+    }

+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(10))]
         #[test]
-        fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
+        fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
             create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
             create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
             create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
         }
     }

+    fn num_strategy() -> impl Strategy<Value = u64> {
+        prop_oneof![
+            1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10)),
+            1 => prop::num::u64::ANY.prop_map(|num| num % 10),
+            20 => prop::num::u64::ANY,
+        ]
+    }
+
     pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
         let mut data_and_names = vec![];
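A note on the new strategy: `prop_oneof!` weights its arms 1 : 1 : 20, so roughly 1 in 22 generated values lands within 10 of `u64::MAX`, 1 in 22 lands in `0..10`, and the rest are drawn uniformly. This steers the property tests toward the extreme values where interpolation-based codecs are most likely to misbehave. Splitting the tests into two `proptest!` blocks also lets the large test run only 10 cases while the small one keeps 100.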
@@ -234,6 +234,7 @@ mod tests {
     #[test]
     fn estimation_good_interpolation_case() {
         let data = (10..=20000_u64).collect::<Vec<_>>();
+        let data: VecColum = data.as_slice().into();

         let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.01);
@@ -247,8 +248,9 @@ mod tests {
     }
     #[test]
     fn estimation_test_bad_interpolation_case() {
-        let data = vec![200, 10, 10, 10, 10, 1000, 20];
+        let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];

+        let data: VecColum = data.into();
         let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
         assert_le!(linear_interpol_estimation, 0.32);
@@ -259,6 +261,7 @@ mod tests {
     fn estimation_test_bad_interpolation_case_monotonically_increasing() {
         let mut data: Vec<u64> = (200..=20000_u64).collect();
         data.push(1_000_000);
+        let data: VecColum = data.as_slice().into();

         // in this case the linear interpolation can't in fact be worse than bitpacking,
         // but the estimator adds some threshold, which leads to estimated worse behavior
@@ -193,6 +193,7 @@ impl FastFieldCodec for LinearCodec {
 /// Estimation for linear interpolation is hard because you don't know
 /// where the local maxima for the deviation of the calculated value are and
 /// the offset to shift all values to >=0 is also unknown.
+#[allow(clippy::question_mark)]
 fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
     if fastfield_accessor.num_vals() < 3 {
         return None; // disable compressor for this case
@@ -258,6 +259,8 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {

 #[cfg(test)]
 mod tests {
+    use rand::RngCore;
+
     use super::*;
     use crate::tests::get_codec_test_datasets;
@@ -340,10 +343,9 @@ mod tests {

     #[test]
     fn linear_interpol_fast_field_rand() {
-        for _ in 0..5000 {
-            let mut data = (0..10_000)
-                .map(|_| rand::random::<u64>())
-                .collect::<Vec<_>>();
+        let mut rng = rand::thread_rng();
+        for _ in 0..50 {
+            let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
             create_and_validate(&data, "random");
             data.reverse();
             create_and_validate(&data, "random");
@@ -3,9 +3,33 @@ extern crate prettytable;
 use fastfield_codecs::bitpacked::BitpackedCodec;
 use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
 use fastfield_codecs::linear::LinearCodec;
-use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
+use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
 use prettytable::{Cell, Row, Table};

+struct Data<'a>(&'a [u64]);
+
+impl<'a> Column for Data<'a> {
+    fn get_val(&self, position: u64) -> u64 {
+        self.0[position as usize]
+    }
+
+    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
+        Box::new(self.0.iter().cloned())
+    }
+
+    fn min_value(&self) -> u64 {
+        *self.0.iter().min().unwrap_or(&0)
+    }
+
+    fn max_value(&self) -> u64 {
+        *self.0.iter().max().unwrap_or(&0)
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.0.len() as u64
+    }
+}
+
 fn main() {
     let mut table = Table::new();
@@ -86,10 +110,11 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
 pub fn serialize_with_codec<C: FastFieldCodec>(
     data: &[u64],
 ) -> Option<(f32, f32, FastFieldCodecType)> {
+    let data = Data(data);
     let estimation = C::estimate(&data)?;
     let mut out = Vec::new();
     C::serialize(&mut out, &data).unwrap();
-    let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
+    let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
     Some((estimation, actual_compression, C::CODEC_TYPE))
 }
@@ -6,7 +6,7 @@ use std::{fmt, io, mem};
 use stable_deref_trait::StableDeref;

 /// An OwnedBytes simply wraps an object that owns a slice of data and exposes
-/// this data as a static slice.
+/// this data as a slice.
 ///
 /// The backing object is required to be `StableDeref`.
 #[derive(Clone)]
@@ -7,6 +7,7 @@ use std::sync::Arc;

 use super::segment::Segment;
 use super::IndexSettings;
+use crate::core::single_segment_index_writer::SingleSegmentIndexWriter;
 use crate::core::{
     Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH,
 };
@@ -16,7 +17,7 @@ use crate::directory::MmapDirectory;
 use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
 use crate::error::{DataCorruption, TantivyError};
 use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
-use crate::indexer::segment_updater::save_new_metas;
+use crate::indexer::segment_updater::save_metas;
 use crate::reader::{IndexReader, IndexReaderBuilder};
 use crate::schema::{Field, FieldType, Schema};
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -47,6 +48,34 @@ fn load_metas(
     .map_err(From::from)
 }

+/// Save the index meta file.
+/// This operation is atomic:
+/// Either
+/// - it fails, in which case an error is returned,
+///   and the `meta.json` remains untouched,
+/// - it succeeds, and `meta.json` is written
+///   and flushed.
+///
+/// This method is not part of tantivy's public API
+fn save_new_metas(
+    schema: Schema,
+    index_settings: IndexSettings,
+    directory: &dyn Directory,
+) -> crate::Result<()> {
+    save_metas(
+        &IndexMeta {
+            index_settings,
+            segments: Vec::new(),
+            schema,
+            opstamp: 0u64,
+            payload: None,
+        },
+        directory,
+    )?;
+    directory.sync_directory()?;
+    Ok(())
+}
+
 /// IndexBuilder can be used to create an index.
 ///
 /// Use in conjunction with `SchemaBuilder`. Global index settings
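This `save_new_metas` is not new logic: a later hunk in this diff deletes the identical `pub fn save_new_metas` from `segment_updater.rs`, so the function simply moves into `index.rs` and drops its `pub` visibility.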
@@ -135,6 +164,25 @@ impl IndexBuilder {
         self.create(mmap_directory)
     }

+    /// Dragons ahead!!!
+    ///
+    /// The point of this API is to let users create a simple index with a single segment
+    /// and without starting any thread.
+    ///
+    /// Do not use this method if you are not sure what you are doing.
+    ///
+    /// It expects an originally empty directory, and will not run any GC operation.
+    #[doc(hidden)]
+    pub fn single_segment_index_writer(
+        self,
+        dir: impl Into<Box<dyn Directory>>,
+        mem_budget: usize,
+    ) -> crate::Result<SingleSegmentIndexWriter> {
+        let index = self.create(dir)?;
+        let index_simple_writer = SingleSegmentIndexWriter::new(index, mem_budget)?;
+        Ok(index_simple_writer)
+    }
+
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
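A minimal usage sketch of the API above, assembled from the test added later in this diff (field name and memory budget are taken from that test):

```rust
use tantivy::directory::RamDirectory;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn build_single_segment_index() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();
    // No indexing threads are spawned; all documents go into one segment.
    let mut writer = Index::builder()
        .schema(schema)
        .single_segment_index_writer(RamDirectory::default(), 10_000_000)?;
    writer.add_document(doc!(text_field => "hello"))?;
    writer.finalize() // writes meta.json, syncs the directory, returns the Index
}
```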
@@ -580,10 +628,12 @@ impl fmt::Debug for Index {

 #[cfg(test)]
 mod tests {
+    use crate::collector::Count;
     use crate::directory::{RamDirectory, WatchCallback};
-    use crate::schema::{Field, Schema, INDEXED, TEXT};
+    use crate::query::TermQuery;
+    use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, TEXT};
     use crate::tokenizer::TokenizerManager;
-    use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
+    use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy, Term};

     #[test]
     fn test_indexer_for_field() {
@@ -849,4 +899,28 @@ mod tests {
         );
         Ok(())
     }
+
+    #[test]
+    fn test_single_segment_index_writer() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let directory = RamDirectory::default();
+        let mut single_segment_index_writer = Index::builder()
+            .schema(schema)
+            .single_segment_index_writer(directory, 10_000_000)?;
+        for _ in 0..10 {
+            let doc = doc!(text_field=>"hello");
+            single_segment_index_writer.add_document(doc)?;
+        }
+        let index = single_segment_index_writer.finalize()?;
+        let searcher = index.reader()?.searcher();
+        let term_query = TermQuery::new(
+            Term::from_field_text(text_field, "hello"),
+            IndexRecordOption::Basic,
+        );
+        let count = searcher.search(&term_query, &Count)?;
+        assert_eq!(count, 10);
+        Ok(())
+    }
 }
@@ -7,6 +7,7 @@ mod segment;
 mod segment_component;
 mod segment_id;
 mod segment_reader;
+mod single_segment_index_writer;

 use std::path::Path;
@@ -23,6 +24,7 @@ pub use self::segment::Segment;
 pub use self::segment_component::SegmentComponent;
 pub use self::segment_id::SegmentId;
 pub use self::segment_reader::SegmentReader;
+pub use self::single_segment_index_writer::SingleSegmentIndexWriter;

 /// The meta file contains all the information about the list of segments and the schema
 /// of the index.
src/core/single_segment_index_writer.rs (new file, 47 lines)

@@ -0,0 +1,47 @@
+use crate::indexer::operation::AddOperation;
+use crate::indexer::segment_updater::save_metas;
+use crate::indexer::SegmentWriter;
+use crate::{Directory, Document, Index, IndexMeta, Opstamp, Segment};
+
+#[doc(hidden)]
+pub struct SingleSegmentIndexWriter {
+    segment_writer: SegmentWriter,
+    segment: Segment,
+    opstamp: Opstamp,
+}
+
+impl SingleSegmentIndexWriter {
+    pub fn new(index: Index, mem_budget: usize) -> crate::Result<Self> {
+        let segment = index.new_segment();
+        let segment_writer = SegmentWriter::for_segment(mem_budget, segment.clone())?;
+        Ok(Self {
+            segment_writer,
+            segment,
+            opstamp: 0,
+        })
+    }
+
+    pub fn add_document(&mut self, document: Document) -> crate::Result<()> {
+        let opstamp = self.opstamp;
+        self.opstamp += 1;
+        self.segment_writer
+            .add_document(AddOperation { opstamp, document })
+    }
+
+    pub fn finalize(self) -> crate::Result<Index> {
+        let max_doc = self.segment_writer.max_doc();
+        self.segment_writer.finalize()?;
+        let segment: Segment = self.segment.with_max_doc(max_doc);
+        let index = segment.index();
+        let index_meta = IndexMeta {
+            index_settings: index.settings().clone(),
+            segments: vec![segment.meta().clone()],
+            schema: index.schema(),
+            opstamp: 0,
+            payload: None,
+        };
+        save_metas(&index_meta, index.directory())?;
+        index.directory().sync_directory()?;
+        Ok(segment.index().clone())
+    }
+}
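Worth noting in `finalize`: it follows the same atomic-save contract quoted in the `index.rs` hunks above. It flushes the segment writer, stamps the segment with its `max_doc`, writes a fresh `IndexMeta` through the now `pub(crate)` `save_metas`, and calls `sync_directory` before returning the `Index`.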
@@ -978,37 +978,20 @@ mod tests {

 #[cfg(all(test, feature = "unstable"))]
 mod bench {
-    use std::collections::HashMap;
-    use std::path::Path;
-
+    use fastfield_codecs::Column;
     use test::{self, Bencher};

-    use super::tests::{generate_permutation, FIELD, SCHEMA};
+    use super::tests::generate_permutation;
     use super::*;
-    use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
+    use crate::fastfield::tests::generate_permutation_gcd;

     #[bench]
-    fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
+    fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
         let permutation = generate_permutation();
+        let n = permutation.len();
         b.iter(|| {
-            let n = test::black_box(7000u32);
             let mut a = 0u64;
-            for i in (0u32..n / 7).map(|v| v * 7) {
-                a ^= permutation[i as usize];
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_veclookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        b.iter(|| {
-            let n = test::black_box(1000u32);
-            let mut a = 0u64;
-            for _ in 0u32..n {
+            for _ in 0..n {
                 a = permutation[a as usize];
             }
             a
@@ -1016,102 +999,83 @@
     }

     #[bench]
-    fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
         let permutation = generate_permutation();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for _ in 0..n {
+                a = column.get_val(a as u64);
             }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let n = test::black_box(7000u32);
-                let mut a = 0u64;
-                for i in (0u32..n / 7).map(|val| val * 7) {
-                    a ^= fast_field_reader.get_val(i as u64);
-                }
-                a
-            });
-        }
+            a
+        });
     }

     #[bench]
-    fn bench_intfastfield_fflookup(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
         let permutation = generate_permutation();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+        let n = permutation.len();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += permutation[i as usize];
             }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let mut a = 0u32;
-                for i in 0u64..permutation.len() as u64 {
-                    a = fast_field_reader.get_val(i) as u32;
-                }
-                a
-            });
-        }
+            a
+        });
     }

     #[bench]
-    fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
-        let path = Path::new("test");
+    fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += column.get_val(i as u64);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0u64..n as u64 {
+                a += column.get_val(i);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
         let permutation = generate_permutation_gcd();
-        let directory: RamDirectory = RamDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
-            let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
-            for &x in &permutation {
-                fast_field_writers.add_document(&doc!(*FIELD=>x));
+        let n = permutation.len();
+        let column = DynamicFastFieldReader::from(permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..n as u64 {
+                a += column.get_val(i);
             }
-            fast_field_writers
-                .serialize(&mut serializer, &HashMap::new(), None)
-                .unwrap();
-            serializer.close().unwrap();
-        }
-        let file = directory.open_read(&path).unwrap();
-        {
-            let fast_fields_composite = CompositeFile::open(&file).unwrap();
-            let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
-
-            b.iter(|| {
-                let mut a = 0u32;
-                for i in 0u32..permutation.len() as u32 {
-                    a = fast_field_reader.get_val(i as u64) as u32;
-                }
-                a
-            });
-        }
+            a
+        });
     }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..permutation.len() {
+                a += permutation[i as usize] as u64;
+            }
+            a
+        });
+    }
 }
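Net effect of the two bench hunks: instead of serializing through `CompositeFastFieldSerializer` and reopening via `CompositeFile`, the benchmarks now build a reader directly with `DynamicFastFieldReader::from(permutation)`. The new names describe the access pattern: `jumpy` chases data-dependent indices, `stride7` reads every 7th value, and `scan_all` reads sequentially, each with a plain `Vec` baseline where applicable.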
@@ -64,8 +64,8 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {

 // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
 // https://github.com/rust-lang/rust/pull/86176
-fn codec_estimation<C: FastFieldCodec, D: Column>(
-    fastfield_accessor: &D,
+fn codec_estimation<C: FastFieldCodec>(
+    fastfield_accessor: &impl Column,
     estimations: &mut Vec<(f32, FastFieldCodecType)>,
 ) {
     if let Some(ratio) = C::estimate(fastfield_accessor) {

@@ -202,13 +202,13 @@ impl CompositeFastFieldSerializer {
     let mut estimations = vec![];

     if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
-        codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
+        codec_estimation::<BitpackedCodec>(&fastfield_accessor, &mut estimations);
     }
     if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
-        codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
+        codec_estimation::<LinearCodec>(&fastfield_accessor, &mut estimations);
     }
     if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
-        codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
+        codec_estimation::<BlockwiseLinearCodec>(&fastfield_accessor, &mut estimations);
     }
     if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
     {
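The comment above points at rust-lang/rust#86176 (`explicit_generic_args_with_impl_trait`). With explicit generic arguments now accepted alongside `impl Trait` parameters, the helper can drop its second type parameter, and every call site shrinks from `codec_estimation::<BitpackedCodec, _>(...)` to `codec_estimation::<BitpackedCodec>(...)`, as the second hunk shows.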
@@ -174,9 +174,7 @@ fn index_documents(
     segment_updater: &mut SegmentUpdater,
     mut delete_cursor: DeleteCursor,
 ) -> crate::Result<()> {
-    let schema = segment.schema();
-
-    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), schema)?;
+    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone())?;
     for document_group in grouped_document_iterator {
         for doc in document_group {
             segment_writer.add_document(doc)?;
@@ -133,7 +133,7 @@ impl TermOrdinalMapping {
     fn max_term_ord(&self) -> TermOrdinal {
         self.per_segment_new_term_ordinals
             .iter()
-            .flat_map(|term_ordinals| term_ordinals.iter().max())
+            .flat_map(|term_ordinals| term_ordinals.iter().max().cloned())
             .max()
             .unwrap_or_default()
     }
@@ -784,7 +784,7 @@ impl IndexMerger {
     let new_doc_id: DocId =
         self.offsets
             .iter()
-            .position(|offset| offset > pos)
+            .position(|&offset| offset > pos)
             .expect("pos is out of bounds") as DocId
             - 1u32;
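Both one-liners above look like fallout from removing the `Column` impls on `Vec<u64>`: previously `.iter()` on these vectors could resolve to `Column::iter`, which yields `u64` by value; it now resolves to the plain slice iterator yielding `&u64`, hence the added `.cloned()` and the `|&offset|` destructuring.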
@@ -25,39 +25,10 @@ use crate::indexer::{
     DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry,
     SegmentSerializer,
 };
-use crate::schema::Schema;
 use crate::{FutureResult, Opstamp};

 const NUM_MERGE_THREADS: usize = 4;

-/// Save the index meta file.
-/// This operation is atomic :
-/// Either
-/// - it fails, in which case an error is returned,
-/// and the `meta.json` remains untouched,
-/// - it succeeds, and `meta.json` is written
-/// and flushed.
-///
-/// This method is not part of tantivy's public API
-pub fn save_new_metas(
-    schema: Schema,
-    index_settings: IndexSettings,
-    directory: &dyn Directory,
-) -> crate::Result<()> {
-    save_metas(
-        &IndexMeta {
-            index_settings,
-            segments: Vec::new(),
-            schema,
-            opstamp: 0u64,
-            payload: None,
-        },
-        directory,
-    )?;
-    directory.sync_directory()?;
-    Ok(())
-}
-
 /// Save the index meta file.
 /// This operation is atomic:
 /// Either
@@ -67,7 +38,7 @@ pub fn save_new_metas(
 /// and flushed.
 ///
 /// This method is not part of tantivy's public API
-fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
+pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
     info!("save metas");
     let mut buffer = serde_json::to_vec_pretty(metas)?;
     // Just adding a new line at the end of the buffer.
@@ -80,8 +80,8 @@ impl SegmentWriter {
     pub fn for_segment(
         memory_budget_in_bytes: usize,
         segment: Segment,
-        schema: Schema,
     ) -> crate::Result<SegmentWriter> {
+        let schema = segment.schema();
         let tokenizer_manager = segment.index().tokenizers().clone();
         let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
         let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
@@ -301,7 +301,7 @@ pub use self::docset::{DocSet, TERMINATED};
 pub use crate::core::{
     Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader,
     Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta,
-    SegmentReader,
+    SegmentReader, SingleSegmentIndexWriter,
 };
 pub use crate::directory::Directory;
 pub use crate::indexer::demuxer::*;
@@ -227,7 +227,7 @@ pub mod tests {

     {
         let mut segment_writer =
-            SegmentWriter::for_segment(3_000_000, segment.clone(), schema).unwrap();
+            SegmentWriter::for_segment(3_000_000, segment.clone()).unwrap();
         {
             // checking that position works if the field has two values
             let op = AddOperation {
@@ -116,7 +116,7 @@ pub(crate) struct IndexingPosition {
 /// and building a `Segment` in anonymous memory.
 ///
 /// `PostingsWriter` writes in a `MemoryArena`.
-pub(crate) trait PostingsWriter {
+pub(crate) trait PostingsWriter: Send + Sync {
     /// Record that a document contains a term at a given position.
     ///
     /// * doc - the document id
@@ -56,7 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
 /// * the document id
 /// * the term frequency
 /// * the term positions
-pub(crate) trait Recorder: Copy + Default + 'static {
+pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
     /// Returns the current document
     fn current_doc(&self) -> u32;
     /// Starts recording information about a new document
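Taken together with the `TerminatingWrite: Write + Send + Sync` change at the top of this diff, the `Send + Sync` bounds added to `PostingsWriter` and `Recorder` make the segment-writing pipeline's components shareable across threads, which is presumably what keeps `SegmentWriter` usable both from the threaded `IndexWriter` workers and from the new `SingleSegmentIndexWriter`.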