diff --git a/bitpacker/src/bitpacker.rs b/bitpacker/src/bitpacker.rs index 4a92f0e7f..e022e33e9 100644 --- a/bitpacker/src/bitpacker.rs +++ b/bitpacker/src/bitpacker.rs @@ -1,4 +1,5 @@ -use std::{convert::TryInto, io}; +use std::convert::TryInto; +use std::io; pub struct BitPacker { mini_buffer: u64, diff --git a/bitpacker/src/blocked_bitpacker.rs b/bitpacker/src/blocked_bitpacker.rs index dc53d0fb1..21cc58d7f 100644 --- a/bitpacker/src/blocked_bitpacker.rs +++ b/bitpacker/src/blocked_bitpacker.rs @@ -1,12 +1,11 @@ +use super::bitpacker::BitPacker; +use super::compute_num_bits; use crate::{minmax, BitUnpacker}; -use super::{bitpacker::BitPacker, compute_num_bits}; - const BLOCK_SIZE: usize = 128; /// `BlockedBitpacker` compresses data in blocks of /// 128 elements, while keeping an index on it -/// #[derive(Debug, Clone)] pub struct BlockedBitpacker { // bitpacked blocks diff --git a/bitpacker/src/lib.rs b/bitpacker/src/lib.rs index 141fe66a5..07e3de461 100644 --- a/bitpacker/src/lib.rs +++ b/bitpacker/src/lib.rs @@ -1,8 +1,7 @@ mod bitpacker; mod blocked_bitpacker; -pub use crate::bitpacker::BitPacker; -pub use crate::bitpacker::BitUnpacker; +pub use crate::bitpacker::{BitPacker, BitUnpacker}; pub use crate::blocked_bitpacker::BlockedBitpacker; /// Computes the number of bits that will be used for bitpacking. diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 54e7b477a..81bb31f55 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -1,8 +1,8 @@ -use ownedbytes::OwnedBytes; use std::convert::TryInto; use std::io::Write; -use std::u64; -use std::{fmt, io}; +use std::{fmt, io, u64}; + +use ownedbytes::OwnedBytes; #[derive(Clone, Copy, Eq, PartialEq)] pub struct TinySet(u64); @@ -187,7 +187,6 @@ fn num_buckets(max_val: u32) -> u32 { impl BitSet { /// serialize a `BitSet`. - /// pub fn serialize(&self, writer: &mut T) -> io::Result<()> { writer.write_all(self.max_value.to_le_bytes().as_ref())?; for tinyset in self.tinysets.iter().cloned() { @@ -353,7 +352,6 @@ impl ReadOnlyBitSet { } /// Iterate the tinyset on the fly from serialized data. - /// #[inline] fn iter_tinysets(&self) -> impl Iterator + '_ { self.data.chunks_exact(8).map(move |chunk| { @@ -363,7 +361,6 @@ impl ReadOnlyBitSet { } /// Iterate over the positions of the elements. 
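As a reading aid for the `TinySet`/`BitSet` hunks above: a minimal, self-contained sketch of the lowest-set-bit iteration trick that a 64-bit tiny set is built on. Names are illustrative; this is not tantivy's actual implementation.

```rust
/// Iterate the positions of set bits in a `u64`: repeatedly take the lowest
/// set bit with `trailing_zeros`, then clear it with `word &= word - 1`.
fn iter_set_bits(mut word: u64) -> impl Iterator<Item = u32> {
    std::iter::from_fn(move || {
        if word == 0 {
            return None;
        }
        let pos = word.trailing_zeros();
        word &= word - 1; // clear the lowest set bit
        Some(pos)
    })
}

fn main() {
    let positions: Vec<u32> = iter_set_bits(0b1010_0110).collect();
    assert_eq!(positions, vec![1, 2, 5, 7]);
}
```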
- /// #[inline] pub fn iter(&self) -> impl Iterator + '_ { self.iter_tinysets() @@ -415,14 +412,14 @@ impl<'a> From<&'a BitSet> for ReadOnlyBitSet { #[cfg(test)] mod tests { - use super::BitSet; - use super::ReadOnlyBitSet; - use super::TinySet; + use std::collections::HashSet; + use ownedbytes::OwnedBytes; use rand::distributions::Bernoulli; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; - use std::collections::HashSet; + + use super::{BitSet, ReadOnlyBitSet, TinySet}; #[test] fn test_read_serialized_bitset_full_multi() { @@ -710,10 +707,10 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::BitSet; - use super::TinySet; use test; + use super::{BitSet, TinySet}; + #[bench] fn bench_tinyset_pop(b: &mut test::Bencher) { b.iter(|| { diff --git a/common/src/lib.rs b/common/src/lib.rs index 20da26fbc..259e5659e 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -104,11 +104,12 @@ pub fn u64_to_f64(val: u64) -> f64 { #[cfg(test)] pub mod test { - use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; - use super::{BinarySerializable, FixedSize}; - use proptest::prelude::*; use std::f64; + use proptest::prelude::*; + + use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize}; + fn test_i64_converter_helper(val: i64) { assert_eq!(u64_to_i64(i64_to_u64(val)), val); } @@ -157,10 +158,10 @@ pub mod test { #[test] fn test_f64_order() { assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY)) - .contains(&f64_to_u64(f64::NAN))); //nan is not a number - assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa - assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent - assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa + .contains(&f64_to_u64(f64::NAN))); // nan is not a number + assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa + assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent + assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0)); assert!(f64_to_u64(-2.0) < f64_to_u64(1.0)); diff --git a/common/src/serialize.rs b/common/src/serialize.rs index e608a7bf9..4d48e16a7 100644 --- a/common/src/serialize.rs +++ b/common/src/serialize.rs @@ -1,10 +1,9 @@ -use crate::Endianness; -use crate::VInt; +use std::io::{Read, Write}; +use std::{fmt, io}; + use byteorder::{ReadBytesExt, WriteBytesExt}; -use std::fmt; -use std::io; -use std::io::Read; -use std::io::Write; + +use crate::{Endianness, VInt}; /// Trait for a simple binary serialization. pub trait BinarySerializable: fmt::Debug + Sized { @@ -202,8 +201,7 @@ impl BinarySerializable for String { #[cfg(test)] pub mod test { - use super::VInt; - use super::*; + use super::{VInt, *}; use crate::serialize::BinarySerializable; pub fn fixed_size_test() { let mut buffer = Vec::new(); diff --git a/common/src/vint.rs b/common/src/vint.rs index 994352644..ec1d166e2 100644 --- a/common/src/vint.rs +++ b/common/src/vint.rs @@ -1,8 +1,9 @@ -use super::BinarySerializable; -use byteorder::{ByteOrder, LittleEndian}; use std::io; -use std::io::Read; -use std::io::Write; +use std::io::{Read, Write}; + +use byteorder::{ByteOrder, LittleEndian}; + +use super::BinarySerializable; /// Wrapper over a `u64` that serializes as a variable int. 
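For context on the `VInt` wrapper touched above, here is a sketch of the classic 7-bits-per-byte varint scheme (high bit used as a continuation flag). It illustrates the general technique only and is not claimed to be byte-for-byte identical to tantivy's encoding.

```rust
/// Write `val` as a variable-length integer: 7 payload bits per byte,
/// high bit set on every byte except the last.
fn write_vint(mut val: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7f) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80);
    }
}

/// Read a varint back, returning the value and the number of bytes consumed.
fn read_vint(bytes: &[u8]) -> (u64, usize) {
    let mut result = 0u64;
    let mut shift = 0;
    for (i, &b) in bytes.iter().enumerate() {
        result |= u64::from(b & 0x7f) << shift;
        if b & 0x80 == 0 {
            return (result, i + 1);
        }
        shift += 7;
    }
    panic!("truncated varint");
}

fn main() {
    let mut buf = Vec::new();
    write_vint(300, &mut buf);
    assert_eq!(read_vint(&buf), (300, 2)); // 300 needs two bytes
}
```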
#[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -174,9 +175,7 @@ impl BinarySerializable for VInt { #[cfg(test)] mod tests { - use super::serialize_vint_u32; - use super::BinarySerializable; - use super::VInt; + use super::{serialize_vint_u32, BinarySerializable, VInt}; fn aux_test_vint(val: u64) { let mut v = [14u8; 10]; diff --git a/common/src/writer.rs b/common/src/writer.rs index 731d6afee..20f56221d 100644 --- a/common/src/writer.rs +++ b/common/src/writer.rs @@ -54,7 +54,8 @@ impl TerminatingWrite for CountingWriter { } } -/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly +/// Struct used to prevent from calling +/// [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly /// /// The point is that while the type is public, it cannot be built by anyone /// outside of this module. @@ -64,9 +65,7 @@ pub struct AntiCallToken(()); pub trait TerminatingWrite: Write { /// Indicate that the writer will no longer be used. Internally call terminate_ref. fn terminate(mut self) -> io::Result<()> - where - Self: Sized, - { + where Self: Sized { self.terminate_ref(AntiCallToken(())) } @@ -97,9 +96,10 @@ impl<'a> TerminatingWrite for &'a mut Vec { #[cfg(test)] mod test { - use super::CountingWriter; use std::io::Write; + use super::CountingWriter; + #[test] fn test_counting_writer() { let buffer: Vec = vec![]; diff --git a/examples/basic_search.rs b/examples/basic_search.rs index 768d76dca..cbdb36ce8 100644 --- a/examples/basic_search.rs +++ b/examples/basic_search.rs @@ -91,8 +91,8 @@ fn main() -> tantivy::Result<()> { old_man_doc.add_text(title, "The Old Man and the Sea"); old_man_doc.add_text( body, - "He was an old man who fished alone in a skiff in the Gulf Stream and \ - he had gone eighty-four days now without taking a fish.", + "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \ + eighty-four days now without taking a fish.", ); // ... and add it to the `IndexWriter`. diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index a2532328a..7bdc9d06b 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -12,8 +12,7 @@ use tantivy::collector::{Collector, SegmentCollector}; use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader}; use tantivy::query::QueryParser; -use tantivy::schema::Field; -use tantivy::schema::{Schema, FAST, INDEXED, TEXT}; +use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT}; use tantivy::{doc, Index, Score, SegmentReader}; #[derive(Default)] diff --git a/examples/deleting_updating_documents.rs b/examples/deleting_updating_documents.rs index ea35fbd91..57891d1bc 100644 --- a/examples/deleting_updating_documents.rs +++ b/examples/deleting_updating_documents.rs @@ -56,8 +56,9 @@ fn main() -> tantivy::Result<()> { // If it is `text`, let's make sure to keep it `raw` and let's avoid // running any text processing on it. // This is done by associating this field to the tokenizer named `raw`. - // Rather than building our [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually, - // We use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions) + // Rather than building our + // [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually, We + // use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions) // and untokenized. 
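The comment above notes that `STRING` is a shortcut for building `TextOptions` by hand. A rough sketch of what the manual equivalent looks like, assuming the `TextOptions`/`TextFieldIndexing` builder API; the field name is illustrative.

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

fn main() {
    let mut schema_builder = Schema::builder();
    // Approximately what `STRING` stands for: indexed with the `raw` tokenizer,
    // without term frequencies or positions (IndexRecordOption::Basic).
    let string_like = TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer("raw")
            .set_index_option(IndexRecordOption::Basic),
    );
    let _id = schema_builder.add_text_field("id", string_like);
    let _schema = schema_builder.build();
}
```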
// // Because we also want to be able to see this `id` in our returned documents, diff --git a/examples/faceted_search_with_tweaked_score.rs b/examples/faceted_search_with_tweaked_score.rs index de2e2e323..07cea50c5 100644 --- a/examples/faceted_search_with_tweaked_score.rs +++ b/examples/faceted_search_with_tweaked_score.rs @@ -1,9 +1,9 @@ use std::collections::HashSet; + use tantivy::collector::TopDocs; -use tantivy::doc; use tantivy::query::BooleanQuery; use tantivy::schema::*; -use tantivy::{DocId, Index, Score, SegmentReader}; +use tantivy::{doc, DocId, Index, Score, SegmentReader}; fn main() -> tantivy::Result<()> { let mut schema_builder = Schema::builder(); @@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> { .unwrap() .get_first(title) .unwrap() - .text() + .as_text() .unwrap() .to_owned() }) diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs index 94849c0c8..1cb3dad56 100644 --- a/examples/iterating_docs_and_positions.rs +++ b/examples/iterating_docs_and_positions.rs @@ -52,11 +52,11 @@ fn main() -> tantivy::Result<()> { let term_the = Term::from_field_text(title, "the"); // This segment posting object is like a cursor over the documents matching the term. - // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies - // and positions. + // The `IndexRecordOption` arguments tells tantivy we will be interested in both term + // frequencies and positions. // - // If you don't need all this information, you may get better performance by decompressing less - // information. + // If you don't need all this information, you may get better performance by decompressing + // less information. if let Some(mut segment_postings) = inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)? { @@ -109,11 +109,11 @@ fn main() -> tantivy::Result<()> { let inverted_index = segment_reader.inverted_index(title)?; // This segment posting object is like a cursor over the documents matching the term. - // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies - // and positions. + // The `IndexRecordOption` arguments tells tantivy we will be interested in both term + // frequencies and positions. // - // If you don't need all this information, you may get better performance by decompressing less - // information. + // If you don't need all this information, you may get better performance by decompressing + // less information. if let Some(mut block_segment_postings) = inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)? { diff --git a/examples/multiple_producer.rs b/examples/multiple_producer.rs index f3f6e1b4e..02807698f 100644 --- a/examples/multiple_producer.rs +++ b/examples/multiple_producer.rs @@ -28,6 +28,7 @@ use std::sync::{Arc, RwLock}; use std::thread; use std::time::Duration; + use tantivy::schema::{Schema, STORED, TEXT}; use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError}; @@ -90,7 +91,8 @@ fn main() -> tantivy::Result<()> { // # In the main thread, we commit 10 times, once every 500ms. for _ in 0..10 { let opstamp: Opstamp = { - // Committing or rollbacking on the other hand requires write lock. This will block other threads. + // Committing or rollbacking on the other hand requires write lock. This will block + // other threads. let mut index_writer_wlock = index_writer.write().unwrap(); index_writer_wlock.commit()? 
}; diff --git a/examples/snippet.rs b/examples/snippet.rs index 23bf1c26a..4d38ade70 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -57,7 +57,10 @@ fn main() -> tantivy::Result<()> { let doc = searcher.doc(doc_address)?; let snippet = snippet_generator.snippet_from_doc(&doc); println!("Document score {}:", score); - println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); + println!( + "title: {}", + doc.get_first(title).unwrap().as_text().unwrap() + ); println!("snippet: {}", snippet.to_html()); println!("custom highlighting: {}", highlight(snippet)); } diff --git a/examples/warmer.rs b/examples/warmer.rs index ae4fa299c..d18d9796d 100644 --- a/examples/warmer.rs +++ b/examples/warmer.rs @@ -6,8 +6,10 @@ use tantivy::collector::TopDocs; use tantivy::fastfield::FastFieldReader; use tantivy::query::QueryParser; use tantivy::schema::{Field, Schema, FAST, TEXT}; -use tantivy::{doc, DocAddress, DocId, Index, IndexReader, SegmentReader}; -use tantivy::{Opstamp, Searcher, SearcherGeneration, SegmentId, Warmer}; +use tantivy::{ + doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId, + SegmentReader, Warmer, +}; // This example shows how warmers can be used to // load a values from an external sources using the Warmer API. @@ -90,7 +92,6 @@ impl Warmer for DynamicPriceColumn { /// This map represents a map (ProductId -> Price) /// /// In practise, it could be fetching things from an external service, like a SQL table. -/// #[derive(Default, Clone)] pub struct ExternalPriceTable { prices: Arc>>, diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index ac4eeaf2a..768037d00 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -4,14 +4,14 @@ extern crate test; #[cfg(test)] mod tests { - use fastfield_codecs::{ - bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}, - linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer}, - multilinearinterpol::{ - MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, - }, - *, + use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}; + use fastfield_codecs::linearinterpol::{ + LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer, }; + use fastfield_codecs::multilinearinterpol::{ + MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, + }; + use fastfield_codecs::*; fn get_data() -> Vec { let mut data: Vec<_> = (100..55000_u64) diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index 60e69ff28..e09f73303 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -1,13 +1,9 @@ -use crate::FastFieldCodecReader; -use crate::FastFieldCodecSerializer; -use crate::FastFieldDataAccess; -use crate::FastFieldStats; -use common::BinarySerializable; use std::io::{self, Write}; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; -use tantivy_bitpacker::BitUnpacker; +use common::BinarySerializable; +use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; + +use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; /// Depending on the field type, a different /// fast field is required. 
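The bitpacked codec above stores each value relative to the column minimum, so the number of bits per element is driven by the amplitude `max - min`. A small self-contained sketch of that bit-width calculation (an illustrative re-implementation, not the `tantivy_bitpacker` code itself):

```rust
/// Number of bits needed to represent any value in `0..=amplitude`.
fn num_bits_for_amplitude(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    let values = [1003u64, 1001, 1007, 1010];
    let min = *values.iter().min().unwrap();
    let max = *values.iter().max().unwrap();
    let bits = num_bits_for_amplitude(max - min);
    assert_eq!(bits, 4); // amplitude 9 fits in 4 bits
    // Payload: 4 values * 4 bits = 16 bits instead of 4 * 64 bits,
    // plus the minimum and bit width stored once as metadata.
}
```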
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 0c9af38a2..1204dd6b0 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -53,7 +53,8 @@ pub trait FastFieldCodecSerializer { pub trait FastFieldDataAccess { /// Return the value associated to the given position. /// - /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons. + /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance + /// reasons. /// /// # Panics /// @@ -82,12 +83,10 @@ impl FastFieldDataAccess for Vec { #[cfg(test)] mod tests { - use crate::{ - bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}, - linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer}, - multilinearinterpol::{ - MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, - }, + use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}; + use crate::linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer}; + use crate::multilinearinterpol::{ + MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, }; pub fn create_and_validate( diff --git a/fastfield_codecs/src/linearinterpol.rs b/fastfield_codecs/src/linearinterpol.rs index c8b410925..b4d0bb480 100644 --- a/fastfield_codecs/src/linearinterpol.rs +++ b/fastfield_codecs/src/linearinterpol.rs @@ -1,15 +1,10 @@ -use crate::FastFieldCodecReader; -use crate::FastFieldCodecSerializer; -use crate::FastFieldDataAccess; -use crate::FastFieldStats; use std::io::{self, Read, Write}; use std::ops::Sub; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; -use common::BinarySerializable; -use common::FixedSize; -use tantivy_bitpacker::BitUnpacker; +use common::{BinarySerializable, FixedSize}; +use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; + +use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; /// Depending on the field type, a different /// fast field is required. @@ -137,7 +132,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { // will be offset to 0 offset = offset.max(calculated_value - actual_value); } else { - //positive value no offset reuqired + // positive value no offset reuqired rel_positive_max = rel_positive_max.max(actual_value - calculated_value); } } @@ -171,7 +166,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { stats: FastFieldStats, ) -> bool { if stats.num_vals < 3 { - return false; //disable compressor for this case + return false; // disable compressor for this case } // On serialisation the offset is added to the actual value. // We need to make sure this won't run into overflow calculation issues. @@ -211,8 +206,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { .max() .unwrap_or(0); - // the theory would be that we don't have the actual max_distance, but we are close within 50% - // threshold. + // the theory would be that we don't have the actual max_distance, but we are close within + // 50% threshold. // It is multiplied by 2 because in a log case scenario the line would be as much above as // below. 
So the offset would = max_distance // diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 197dfa20f..bfce54ca4 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,10 +1,8 @@ #[macro_use] extern crate prettytable; -use fastfield_codecs::{ - linearinterpol::LinearInterpolFastFieldSerializer, - multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer, - FastFieldStats, -}; +use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; +use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; +use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats}; use prettytable::{Cell, Row, Table}; fn main() { @@ -24,7 +22,7 @@ fn main() { ); results.push(res); - //let best_estimation_codec = results + // let best_estimation_codec = results //.iter() //.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap()) //.unwrap(); @@ -73,7 +71,7 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { current_cumulative }) .collect::>(); - //let data = (1..=200000_u64).map(|num| num + num).collect::>(); + // let data = (1..=200000_u64).map(|num| num + num).collect::>(); data_and_names.push((data, "Monotonically increasing concave")); let mut current_cumulative = 0; diff --git a/fastfield_codecs/src/multilinearinterpol.rs b/fastfield_codecs/src/multilinearinterpol.rs index 10b2d4e78..bc11c951b 100644 --- a/fastfield_codecs/src/multilinearinterpol.rs +++ b/fastfield_codecs/src/multilinearinterpol.rs @@ -1,30 +1,22 @@ -/*! +//! MultiLinearInterpol compressor uses linear interpolation to guess a values and stores the +//! offset, but in blocks of 512. +//! +//! With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 / +//! 512 = 0,45 bits per element. The additional space required per element in a block is the the +//! maximum deviation of the linear interpolation estimation function. +//! +//! E.g. if the maximum deviation of an element is 12, all elements cost 4bits. +//! +//! Size per block: +//! Num Elements * Maximum Deviation from Interpolation + 29 Byte Metadata -MultiLinearInterpol compressor uses linear interpolation to guess a values and stores the offset, but in blocks of 512. - -With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 / 512 = 0,45 bits per element. -The additional space required per element in a block is the the maximum deviation of the linear interpolation estimation function. - -E.g. if the maximum deviation of an element is 12, all elements cost 4bits. 
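A quick worked check of the block-cost figures quoted in this doc comment (sketch only):

```rust
fn main() {
    const CHUNK_SIZE: u64 = 512;
    const METADATA_BYTES: u64 = 29;
    // 29 bytes = 232 bits of metadata per block of 512 elements.
    let metadata_bits_per_element = METADATA_BYTES as f64 * 8.0 / CHUNK_SIZE as f64;
    assert!((metadata_bits_per_element - 0.453).abs() < 0.01); // 232 / 512 ≈ 0.45

    // If the worst deviation from the interpolation line inside a block is 12,
    // each element of that block is stored on 4 bits (2^4 = 16 >= 13 possible offsets).
    let bits_for_deviation_12 = 64 - 12u64.leading_zeros();
    assert_eq!(bits_for_deviation_12, 4);
}
```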
- -Size per block: -Num Elements * Maximum Deviation from Interpolation + 29 Byte Metadata - -*/ - -use crate::FastFieldCodecReader; -use crate::FastFieldCodecSerializer; -use crate::FastFieldDataAccess; -use crate::FastFieldStats; -use common::CountingWriter; use std::io::{self, Read, Write}; use std::ops::Sub; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; -use common::BinarySerializable; -use common::DeserializeFrom; -use tantivy_bitpacker::BitUnpacker; +use common::{BinarySerializable, CountingWriter, DeserializeFrom}; +use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; + +use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; const CHUNK_SIZE: u64 = 512; @@ -252,11 +244,11 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { ); if calculated_value > actual_value { // negative value we need to apply an offset - // we ignore negative values in the max value calculation, because negative values - // will be offset to 0 + // we ignore negative values in the max value calculation, because negative + // values will be offset to 0 offset = offset.max(calculated_value - actual_value); } else { - //positive value no offset reuqired + // positive value no offset reuqired rel_positive_max = rel_positive_max.max(actual_value - calculated_value); } } @@ -350,8 +342,8 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { .unwrap(); // Estimate one block and extrapolate the cost to all blocks. - // the theory would be that we don't have the actual max_distance, but we are close within 50% - // threshold. + // the theory would be that we don't have the actual max_distance, but we are close within + // 50% threshold. // It is multiplied by 2 because in a log case scenario the line would be as much above as // below. So the offset would = max_distance // diff --git a/ownedbytes/src/lib.rs b/ownedbytes/src/lib.rs index 9d52f3d70..6bb0ee311 100644 --- a/ownedbytes/src/lib.rs +++ b/ownedbytes/src/lib.rs @@ -1,11 +1,11 @@ #![allow(clippy::return_self_not_must_use)] -use stable_deref_trait::StableDeref; use std::convert::TryInto; -use std::mem; use std::ops::{Deref, Range}; use std::sync::Arc; -use std::{fmt, io}; +use std::{fmt, io, mem}; + +use stable_deref_trait::StableDeref; /// An OwnedBytes simply wraps an object that owns a slice of data and exposes /// this data as a static slice. @@ -102,7 +102,6 @@ impl OwnedBytes { } /// Drops the left most `advance_len` bytes. - /// #[inline] pub fn advance(&mut self, advance_len: usize) { self.data = &self.data[advance_len..] 
@@ -163,8 +162,7 @@ impl PartialEq for OwnedBytes { } impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes -where - OwnedBytes: PartialEq, +where OwnedBytes: PartialEq { fn eq(&self, other: &&'a T) -> bool { *self == **other diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index 6250eaba3..ee9f295ae 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -1,17 +1,20 @@ -use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; -use crate::Occur; +use combine::error::StringStreamError; use combine::parser::char::{char, digit, space, spaces, string}; +use combine::parser::combinator::recognize; use combine::parser::range::{take_while, take_while1}; use combine::parser::repeat::escaped; use combine::parser::Parser; use combine::{ attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value, }; -use combine::{error::StringStreamError, parser::combinator::recognize}; use once_cell::sync::Lazy; use regex::Regex; -// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to special characters. +use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; +use crate::Occur; + +// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to +// special characters. const SPECIAL_CHARS: &[char] = &[ '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ', ]; @@ -363,9 +366,10 @@ mod test { type TestParseResult = Result<(), StringStreamError>; - use super::*; use combine::parser::Parser; + use super::*; + pub fn nearly_equals(a: f64, b: f64) -> bool { (a - b).abs() < 0.0005 * (a + b).abs() } diff --git a/rustfmt.toml b/rustfmt.toml index f56c0c3ef..71487e460 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1 +1,7 @@ -use_try_shorthand = true +comment_width = 120 +format_strings = true +group_imports = "StdExternalCrate" +imports_granularity = "Module" +normalize_comments = true +where_single_line = true +wrap_comments = true diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 1a1e207ac..075a4f36b 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -1,9 +1,6 @@ use super::Collector; use crate::collector::SegmentCollector; -use crate::DocId; -use crate::Score; -use crate::SegmentOrdinal; -use crate::SegmentReader; +use crate::{DocId, Score, SegmentOrdinal, SegmentReader}; /// `CountCollector` collector only counts how many /// documents match the query. 
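Alongside the `CountCollector` hunk, a minimal end-to-end usage sketch, mirroring the pattern used in the collector docs elsewhere in this diff; schema and field names are illustrative.

```rust
use tantivy::collector::Count;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(3_000_000)?;
    index_writer.add_document(doc!(title => "of mice and men"))?;
    index_writer.add_document(doc!(title => "frankenstein"))?;
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();
    // `Count`'s fruit is a plain `usize`: the number of matching documents.
    let n_docs: usize = searcher.search(&AllQuery, &Count)?;
    assert_eq!(n_docs, 2);
    Ok(())
}
```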
@@ -80,8 +77,7 @@ impl SegmentCollector for SegmentCountCollector { #[cfg(test)] mod tests { use super::{Count, SegmentCountCollector}; - use crate::collector::Collector; - use crate::collector::SegmentCollector; + use crate::collector::{Collector, SegmentCollector}; #[test] fn test_count_collect_does_not_requires_scoring() { diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs index 0a804cf5c..d645004ad 100644 --- a/src/collector/custom_score_top_collector.rs +++ b/src/collector/custom_score_top_collector.rs @@ -8,8 +8,7 @@ pub(crate) struct CustomScoreTopCollector { } impl CustomScoreTopCollector -where - TScore: Clone + PartialOrd, +where TScore: Clone + PartialOrd { pub(crate) fn new( custom_scorer: TCustomScorer, @@ -114,8 +113,7 @@ where } impl CustomSegmentScorer for F -where - F: 'static + FnMut(DocId) -> TScore, +where F: 'static + FnMut(DocId) -> TScore { fn score(&mut self, doc: DocId) -> TScore { (self)(doc) diff --git a/src/collector/docset_collector.rs b/src/collector/docset_collector.rs index 875d376ca..a27a39418 100644 --- a/src/collector/docset_collector.rs +++ b/src/collector/docset_collector.rs @@ -1,8 +1,7 @@ use std::collections::HashSet; -use crate::{DocAddress, DocId, Score}; - use super::{Collector, SegmentCollector}; +use crate::{DocAddress, DocId, Score}; /// Collectors that returns the set of DocAddress that matches the query. /// diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 42a18d68b..e2ef47f98 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -1,21 +1,14 @@ -use crate::collector::Collector; -use crate::collector::SegmentCollector; -use crate::fastfield::FacetReader; -use crate::schema::Facet; -use crate::schema::Field; -use crate::DocId; -use crate::Score; -use crate::SegmentOrdinal; -use crate::SegmentReader; use std::cmp::Ordering; -use std::collections::btree_map; -use std::collections::BTreeMap; -use std::collections::BTreeSet; -use std::collections::BinaryHeap; +use std::collections::{btree_map, BTreeMap, BTreeSet, BinaryHeap}; use std::iter::Peekable; use std::ops::Bound; use std::{u64, usize}; +use crate::collector::{Collector, SegmentCollector}; +use crate::fastfield::FacetReader; +use crate::schema::{Facet, Field}; +use crate::{DocId, Score, SegmentOrdinal, SegmentReader}; + struct Hit<'a> { count: u64, facet: &'a Facet, @@ -240,9 +233,7 @@ impl FacetCollector { /// If you need the correct number of unique documents for two such facets, /// just add them in separate `FacetCollector`. pub fn add_facet(&mut self, facet_from: T) - where - Facet: From, - { + where Facet: From { let facet = Facet::from(facet_from); for old_facet in &self.facets { assert!( @@ -402,9 +393,7 @@ impl FacetCounts { /// Returns an iterator over all of the facet count pairs inside this result. /// See the documentation for [FacetCollector] for a usage example. pub fn get(&self, facet_from: T) -> FacetChildIterator<'_> - where - Facet: From, - { + where Facet: From { let facet = Facet::from(facet_from); let left_bound = Bound::Excluded(facet.clone()); let right_bound = if facet.is_root() { @@ -423,9 +412,7 @@ impl FacetCounts { /// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts. /// See the documentation for [FacetCollector] for a usage example. 
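Since the doc comment above defers to the `FacetCollector` documentation for a usage example, here is a compact sketch of the drill-down/`top_k` flow, assuming the `add_facet_field`/`FacetOptions` API of this version; field names and facet paths are illustrative.

```rust
use tantivy::collector::FacetCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{Facet, FacetOptions, Schema};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let tags = schema_builder.add_facet_field("tags", FacetOptions::default());
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(3_000_000)?;
    index_writer.add_document(doc!(tags => Facet::from("/lang/en")))?;
    index_writer.add_document(doc!(tags => Facet::from("/lang/en")))?;
    index_writer.add_document(doc!(tags => Facet::from("/lang/fr")))?;
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();

    // Count documents per facet under "/lang", then take the top 1.
    let mut facet_collector = FacetCollector::for_field(tags);
    facet_collector.add_facet("/lang");
    let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
    let top: Vec<(&Facet, u64)> = facet_counts.top_k("/lang", 1);
    assert_eq!(top[0].1, 2); // "/lang/en" appears twice
    Ok(())
}
```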
pub fn top_k(&self, facet: T, k: usize) -> Vec<(&Facet, u64)> - where - Facet: From, - { + where Facet: From { let mut heap = BinaryHeap::with_capacity(k); let mut it = self.get(facet); @@ -458,16 +445,18 @@ impl FacetCounts { #[cfg(test)] mod tests { + use std::iter; + + use rand::distributions::Uniform; + use rand::prelude::SliceRandom; + use rand::{thread_rng, Rng}; + use super::{FacetCollector, FacetCounts}; use crate::collector::Count; use crate::core::Index; use crate::query::{AllQuery, QueryParser, TermQuery}; use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema}; use crate::Term; - use rand::distributions::Uniform; - use rand::prelude::SliceRandom; - use rand::{thread_rng, Rng}; - use std::iter; #[test] fn test_facet_collector_drilldown() -> crate::Result<()> { @@ -522,8 +511,9 @@ mod tests { } #[test] - #[should_panic(expected = "Tried to add a facet which is a descendant of \ - an already added facet.")] + #[should_panic( + expected = "Tried to add a facet which is a descendant of an already added facet." + )] fn test_misused_facet_collector() { let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0)); facet_collector.add_facet(Facet::from("/country")); @@ -700,13 +690,14 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { + use rand::seq::SliceRandom; + use rand::thread_rng; + use test::Bencher; + use crate::collector::FacetCollector; use crate::query::AllQuery; use crate::schema::{Facet, Schema, INDEXED}; use crate::Index; - use rand::seq::SliceRandom; - use rand::thread_rng; - use test::Bencher; #[bench] fn bench_facet_collector(b: &mut Bencher) { diff --git a/src/collector/filter_collector_wrapper.rs b/src/collector/filter_collector_wrapper.rs index fb82a96f8..b1dbaaa20 100644 --- a/src/collector/filter_collector_wrapper.rs +++ b/src/collector/filter_collector_wrapper.rs @@ -17,7 +17,8 @@ use crate::schema::Field; use crate::{Score, SegmentReader, TantivyError}; /// The `FilterCollector` filters docs using a fast field value and a predicate. -/// Only the documents for which the predicate returned "true" will be passed on to the next collector. +/// Only the documents for which the predicate returned "true" will be passed on to the next +/// collector. /// /// ```rust /// use tantivy::collector::{TopDocs, FilterCollector}; @@ -58,8 +59,7 @@ use crate::{Score, SegmentReader, TantivyError}; /// # } /// ``` pub struct FilterCollector -where - TPredicate: 'static + Clone, +where TPredicate: 'static + Clone { field: Field, collector: TCollector, diff --git a/src/collector/histogram_collector.rs b/src/collector/histogram_collector.rs index a5297d704..8685b4aca 100644 --- a/src/collector/histogram_collector.rs +++ b/src/collector/histogram_collector.rs @@ -1,8 +1,9 @@ +use fastdivide::DividerU64; + use crate::collector::{Collector, SegmentCollector}; use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue}; use crate::schema::{Field, Type}; use crate::{DocId, Score}; -use fastdivide::DividerU64; /// Histogram builds an histogram of the values of a fastfield for the /// collected DocSet. @@ -36,8 +37,8 @@ impl HistogramCollector { /// - `bucket_width`: the length of the interval that is associated to each buckets. /// - `num_buckets`: The overall number of buckets. /// - /// Together, this parameters define a partition of `[min_value, min_value + num_buckets * bucket_width)` - /// into `num_buckets` intervals of width bucket that we call `bucket`. 
+ /// Together, this parameters define a partition of `[min_value, min_value + num_buckets * + /// bucket_width)` into `num_buckets` intervals of width bucket that we call `bucket`. /// /// # Disclaimer /// This function panics if the field given is of type f64. @@ -147,12 +148,13 @@ fn add_vecs(mut vals_list: Vec>, len: usize) -> Vec { #[cfg(test)] mod tests { + use fastdivide::DividerU64; + use query::AllQuery; + use super::{add_vecs, HistogramCollector, HistogramComputer}; use crate::chrono::{TimeZone, Utc}; use crate::schema::{Schema, FAST}; use crate::{doc, query, Index}; - use fastdivide::DividerU64; - use query::AllQuery; #[test] fn test_add_histograms_simple() { diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 3fa2b5fd7..3c2c42ca1 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -1,95 +1,90 @@ -/*! +//! # Collectors +//! +//! Collectors define the information you want to extract from the documents matching the queries. +//! In tantivy jargon, we call this information your search "fruit". +//! +//! Your fruit could for instance be : +//! - [the count of matching documents](./struct.Count.html) +//! - [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html) +//! - [facet counts](./struct.FacetCollector.html) +//! +//! At one point in your code, you will trigger the actual search operation by calling +//! [the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search). +//! This call will look like this. +//! +//! ```verbatim +//! let fruit = searcher.search(&query, &collector)?; +//! ``` +//! +//! Here the type of fruit is actually determined as an associated type of the collector +//! (`Collector::Fruit`). +//! +//! +//! # Combining several collectors +//! +//! A rich search experience often requires to run several collectors on your search query. +//! For instance, +//! - selecting the top-K products matching your query +//! - counting the matching documents +//! - computing several facets +//! - computing statistics about the matching product prices +//! +//! A simple and efficient way to do that is to pass your collectors as one tuple. +//! The resulting `Fruit` will then be a typed tuple with each collector's original fruits +//! in their respective position. +//! +//! ```rust +//! # use tantivy::schema::*; +//! # use tantivy::*; +//! # use tantivy::query::*; +//! use tantivy::collector::{Count, TopDocs}; +//! # +//! # fn main() -> tantivy::Result<()> { +//! # let mut schema_builder = Schema::builder(); +//! # let title = schema_builder.add_text_field("title", TEXT); +//! # let schema = schema_builder.build(); +//! # let index = Index::create_in_ram(schema); +//! # let mut index_writer = index.writer(3_000_000)?; +//! # index_writer.add_document(doc!( +//! # title => "The Name of the Wind", +//! # ))?; +//! # index_writer.add_document(doc!( +//! # title => "The Diary of Muadib", +//! # ))?; +//! # index_writer.commit()?; +//! # let reader = index.reader()?; +//! # let searcher = reader.searcher(); +//! # let query_parser = QueryParser::for_index(&index, vec![title]); +//! # let query = query_parser.parse_query("diary")?; +//! let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) = +//! searcher.search(&query, &(Count, TopDocs::with_limit(2)))?; +//! # Ok(()) +//! # } +//! ``` +//! +//! The `Collector` trait is implemented for up to 4 collectors. +//! If you have more than 4 collectors, you can either group them into +//! 
tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html). +//! +//! # Combining several collectors dynamically +//! +//! Combining collectors into a tuple is a zero-cost abstraction: everything +//! happens as if you had manually implemented a single collector +//! combining all of our features. +//! +//! Unfortunately it requires you to know at compile time your collector types. +//! If on the other hand, the collectors depend on some query parameter, +//! you can rely on `MultiCollector`'s. +//! +//! +//! # Implementing your own collectors. +//! +//! See the `custom_collector` example. -# Collectors - -Collectors define the information you want to extract from the documents matching the queries. -In tantivy jargon, we call this information your search "fruit". - -Your fruit could for instance be : -- [the count of matching documents](./struct.Count.html) -- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html) -- [facet counts](./struct.FacetCollector.html) - -At one point in your code, you will trigger the actual search operation by calling -[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search). -This call will look like this. - -```verbatim -let fruit = searcher.search(&query, &collector)?; -``` - -Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`). - - -# Combining several collectors - -A rich search experience often requires to run several collectors on your search query. -For instance, -- selecting the top-K products matching your query -- counting the matching documents -- computing several facets -- computing statistics about the matching product prices - -A simple and efficient way to do that is to pass your collectors as one tuple. -The resulting `Fruit` will then be a typed tuple with each collector's original fruits -in their respective position. - -```rust -# use tantivy::schema::*; -# use tantivy::*; -# use tantivy::query::*; -use tantivy::collector::{Count, TopDocs}; -# -# fn main() -> tantivy::Result<()> { -# let mut schema_builder = Schema::builder(); -# let title = schema_builder.add_text_field("title", TEXT); -# let schema = schema_builder.build(); -# let index = Index::create_in_ram(schema); -# let mut index_writer = index.writer(3_000_000)?; -# index_writer.add_document(doc!( -# title => "The Name of the Wind", -# ))?; -# index_writer.add_document(doc!( -# title => "The Diary of Muadib", -# ))?; -# index_writer.commit()?; -# let reader = index.reader()?; -# let searcher = reader.searcher(); -# let query_parser = QueryParser::for_index(&index, vec![title]); -# let query = query_parser.parse_query("diary")?; -let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) = - searcher.search(&query, &(Count, TopDocs::with_limit(2)))?; -# Ok(()) -# } -``` - -The `Collector` trait is implemented for up to 4 collectors. -If you have more than 4 collectors, you can either group them into -tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html). - -# Combining several collectors dynamically - -Combining collectors into a tuple is a zero-cost abstraction: everything -happens as if you had manually implemented a single collector -combining all of our features. - -Unfortunately it requires you to know at compile time your collector types. -If on the other hand, the collectors depend on some query parameter, -you can rely on `MultiCollector`'s. - - -# Implementing your own collectors. 
- -See the `custom_collector` example. - -*/ - -use crate::DocId; -use crate::Score; -use crate::SegmentOrdinal; -use crate::SegmentReader; use downcast_rs::impl_downcast; +use crate::{DocId, Score, SegmentOrdinal, SegmentReader}; + mod count_collector; pub use self::count_collector::Count; @@ -111,8 +106,7 @@ mod tweak_score_top_collector; pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker}; mod facet_collector; -pub use self::facet_collector::FacetCollector; -pub use self::facet_collector::FacetCounts; +pub use self::facet_collector::{FacetCollector, FacetCounts}; use crate::query::Weight; mod docset_collector; diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index da9c8fac3..9be2f2c71 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -1,14 +1,10 @@ -use super::Collector; -use super::SegmentCollector; -use crate::collector::Fruit; -use crate::DocId; -use crate::Score; -use crate::SegmentOrdinal; -use crate::SegmentReader; -use crate::TantivyError; use std::marker::PhantomData; use std::ops::Deref; +use super::{Collector, SegmentCollector}; +use crate::collector::Fruit; +use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; + pub struct MultiFruit { sub_fruits: Vec>>, } @@ -104,7 +100,8 @@ impl FruitHandle { /// /// If the type of the collectors is known, you can just group yours collectors /// in a tuple. See the -/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors). +/// [Combining several collectors section of the collector +/// documentation](./index.html#combining-several-collectors). /// /// ```rust /// use tantivy::collector::{Count, TopDocs, MultiCollector}; @@ -248,10 +245,8 @@ mod tests { use super::*; use crate::collector::{Count, TopDocs}; use crate::query::TermQuery; - use crate::schema::IndexRecordOption; - use crate::schema::{Schema, TEXT}; - use crate::Index; - use crate::Term; + use crate::schema::{IndexRecordOption, Schema, TEXT}; + use crate::{Index, Term}; #[test] fn test_multi_collector() -> crate::Result<()> { diff --git a/src/collector/tests.rs b/src/collector/tests.rs index 0c41e855c..0fd21b054 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -1,21 +1,13 @@ -use super::*; -use crate::core::SegmentReader; -use crate::fastfield::BytesFastFieldReader; -use crate::fastfield::DynamicFastFieldReader; -use crate::fastfield::FastFieldReader; -use crate::schema::Field; -use crate::DocId; -use crate::Score; -use crate::SegmentOrdinal; -use crate::{DocAddress, Document, Searcher}; - -use crate::collector::{Count, FilterCollector, TopDocs}; -use crate::query::{AllQuery, QueryParser}; -use crate::schema::{Schema, FAST, TEXT}; -use crate::DateTime; -use crate::{doc, Index}; use std::str::FromStr; +use super::*; +use crate::collector::{Count, FilterCollector, TopDocs}; +use crate::core::SegmentReader; +use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader, FastFieldReader}; +use crate::query::{AllQuery, QueryParser}; +use crate::schema::{Field, Schema, FAST, TEXT}; +use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal}; + pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector { compute_score: true, }; diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 14b1d5c67..34dbc8d33 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -1,11 +1,9 @@ -use 
crate::DocAddress; -use crate::DocId; -use crate::SegmentOrdinal; -use crate::SegmentReader; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::marker::PhantomData; +use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader}; + /// Contains a feature (field, score, etc.) of a document along with the document address. /// /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the @@ -62,8 +60,7 @@ pub(crate) struct TopCollector { } impl TopCollector -where - T: PartialOrd + Clone, +where T: PartialOrd + Clone { /// Creates a top collector, with a number of documents equal to "limit". /// @@ -322,9 +319,10 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::TopSegmentCollector; use test::Bencher; + use super::TopSegmentCollector; + #[bench] fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) { let mut top_collector = TopSegmentCollector::new(0, 400); diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 99bfba5fc..89c871854 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -1,21 +1,18 @@ +use std::collections::BinaryHeap; +use std::fmt; +use std::marker::PhantomData; + use super::Collector; -use crate::collector::top_collector::{ComparableDoc, TopCollector}; +use crate::collector::custom_score_top_collector::CustomScoreTopCollector; +use crate::collector::top_collector::{ComparableDoc, TopCollector, TopSegmentCollector}; use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector; use crate::collector::{ CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector, }; -use crate::fastfield::{DynamicFastFieldReader, FastFieldReader}; +use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue}; use crate::query::Weight; use crate::schema::Field; -use crate::DocAddress; -use crate::DocId; -use crate::Score; -use crate::SegmentOrdinal; -use crate::SegmentReader; -use crate::{collector::custom_score_top_collector::CustomScoreTopCollector, fastfield::FastValue}; -use crate::{collector::top_collector::TopSegmentCollector, TantivyError}; -use std::fmt; -use std::{collections::BinaryHeap, marker::PhantomData}; +use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; struct FastFieldConvertCollector< TCollector: Collector>, @@ -217,11 +214,12 @@ impl TopDocs { /// Set top-K to rank documents by a given fast field. /// - /// If the field is not a fast or does not exist, this method returns successfully (it is not aware of any schema). - /// An error will be returned at the moment of search. + /// If the field is not a fast or does not exist, this method returns successfully (it is not + /// aware of any schema). An error will be returned at the moment of search. /// - /// If the field is a FAST field but not a u64 field, search will return successfully but it will return - /// returns a monotonic u64-representation (ie. the order is still correct) of the requested field type. + /// If the field is a FAST field but not a u64 field, search will return successfully but it + /// will return returns a monotonic u64-representation (ie. the order is still correct) of + /// the requested field type. /// /// # Example /// @@ -296,14 +294,15 @@ impl TopDocs { /// Set top-K to rank documents by a given fast field. 
/// - /// If the field is not a fast field, or its field type does not match the generic type, this method does not panic, - /// but an explicit error will be returned at the moment of collection. + /// If the field is not a fast field, or its field type does not match the generic type, this + /// method does not panic, but an explicit error will be returned at the moment of + /// collection. /// /// Note that this method is a generic. The requested fast field type will be often /// inferred in your code by the rust compiler. /// - /// Implementation-wise, for performance reason, tantivy will manipulate the u64 representation of your fast - /// field until the last moment. + /// Implementation-wise, for performance reason, tantivy will manipulate the u64 representation + /// of your fast field until the last moment. /// /// # Example /// @@ -715,10 +714,7 @@ mod tests { use crate::collector::Collector; use crate::query::{AllQuery, Query, QueryParser}; use crate::schema::{Field, Schema, FAST, STORED, TEXT}; - use crate::Index; - use crate::IndexWriter; - use crate::Score; - use crate::{DocAddress, DocId, SegmentReader}; + use crate::{DocAddress, DocId, Index, IndexWriter, Score, SegmentReader}; fn make_index() -> crate::Result { let mut schema_builder = Schema::builder(); diff --git a/src/collector/tweak_score_top_collector.rs b/src/collector/tweak_score_top_collector.rs index d524e0a00..1a81e7361 100644 --- a/src/collector/tweak_score_top_collector.rs +++ b/src/collector/tweak_score_top_collector.rs @@ -1,7 +1,6 @@ use crate::collector::top_collector::{TopCollector, TopSegmentCollector}; use crate::collector::{Collector, SegmentCollector}; -use crate::DocAddress; -use crate::{DocId, Result, Score, SegmentReader}; +use crate::{DocAddress, DocId, Result, Score, SegmentReader}; pub(crate) struct TweakedScoreTopCollector { score_tweaker: TScoreTweaker, @@ -9,8 +8,7 @@ pub(crate) struct TweakedScoreTopCollector { } impl TweakedScoreTopCollector -where - TScore: Clone + PartialOrd, +where TScore: Clone + PartialOrd { pub fn new( score_tweaker: TScoreTweaker, @@ -118,8 +116,7 @@ where } impl ScoreSegmentTweaker for F -where - F: 'static + FnMut(DocId, Score) -> TScore, +where F: 'static + FnMut(DocId, Score) -> TScore { fn score(&mut self, doc: DocId, score: Score) -> TScore { (self)(doc, score) diff --git a/src/core/executor.rs b/src/core/executor.rs index 8ac39a7eb..5f0b930aa 100644 --- a/src/core/executor.rs +++ b/src/core/executor.rs @@ -57,7 +57,11 @@ impl Executor { let (idx, arg) = arg_with_idx; let fruit = f(arg); if let Err(err) = fruit_sender.send((idx, fruit)) { - error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err); + error!( + "Failed to send search task. It probably means all search \ + threads have panicked. 
{:?}", + err + ); } }); } diff --git a/src/core/index.rs b/src/core/index.rs index fd88fa708..b75536be1 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,35 +1,27 @@ -use super::{segment::Segment, IndexSettings}; -use crate::core::Executor; -use crate::core::IndexMeta; -use crate::core::SegmentId; -use crate::core::SegmentMeta; -use crate::core::SegmentMetaInventory; -use crate::core::META_FILEPATH; -use crate::directory::error::OpenReadError; -use crate::directory::ManagedDirectory; -#[cfg(feature = "mmap")] -use crate::directory::MmapDirectory; -use crate::directory::INDEX_WRITER_LOCK; -use crate::directory::{Directory, RamDirectory}; -use crate::error::DataCorruption; -use crate::error::TantivyError; -use crate::indexer::index_writer::{HEAP_SIZE_MIN, MAX_NUM_THREAD}; -use crate::indexer::segment_updater::save_new_metas; -use crate::reader::IndexReader; -use crate::reader::IndexReaderBuilder; -use crate::schema::Field; -use crate::schema::FieldType; -use crate::schema::Schema; -use crate::tokenizer::{TextAnalyzer, TokenizerManager}; -use crate::IndexWriter; use std::collections::HashSet; use std::fmt; - #[cfg(feature = "mmap")] use std::path::Path; use std::path::PathBuf; use std::sync::Arc; +use super::segment::Segment; +use super::IndexSettings; +use crate::core::{ + Executor, IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory, META_FILEPATH, +}; +use crate::directory::error::OpenReadError; +#[cfg(feature = "mmap")] +use crate::directory::MmapDirectory; +use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK}; +use crate::error::{DataCorruption, TantivyError}; +use crate::indexer::index_writer::{HEAP_SIZE_MIN, MAX_NUM_THREAD}; +use crate::indexer::segment_updater::save_new_metas; +use crate::reader::{IndexReader, IndexReaderBuilder}; +use crate::schema::{Field, FieldType, Schema}; +use crate::tokenizer::{TextAnalyzer, TokenizerManager}; +use crate::IndexWriter; + fn load_metas( directory: &dyn Directory, inventory: &SegmentMetaInventory, @@ -78,7 +70,6 @@ fn load_metas( /// let schema = schema_builder.build(); /// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc}), ..Default::default()}; /// let index = Index::builder().schema(schema).settings(settings).create_in_ram(); -/// /// ``` pub struct IndexBuilder { schema: Option, @@ -416,10 +407,9 @@ impl Index { TantivyError::LockFailure( err, Some( - "Failed to acquire index lock. If you are using \ - a regular directory, this means there is already an \ - `IndexWriter` working on this `Directory`, in this process \ - or in a different process." + "Failed to acquire index lock. If you are using a regular directory, this \ + means there is already an `IndexWriter` working on this `Directory`, in \ + this process or in a different process." 
.to_string(), ), ) @@ -462,13 +452,11 @@ impl Index { } /// Accessor to the index settings - /// pub fn settings(&self) -> &IndexSettings { &self.settings } /// Accessor to the index settings - /// pub fn settings_mut(&mut self) -> &mut IndexSettings { &mut self.settings } @@ -556,15 +544,9 @@ impl fmt::Debug for Index { #[cfg(test)] mod tests { - use crate::schema::Field; - use crate::schema::{Schema, INDEXED, TEXT}; - use crate::IndexReader; - use crate::ReloadPolicy; - use crate::{ - directory::{RamDirectory, WatchCallback}, - IndexSettings, - }; - use crate::{Directory, Index}; + use crate::directory::{RamDirectory, WatchCallback}; + use crate::schema::{Field, Schema, INDEXED, TEXT}; + use crate::{Directory, Index, IndexReader, IndexSettings, ReloadPolicy}; #[test] fn test_indexer_for_field() { @@ -673,10 +655,12 @@ mod tests { #[cfg(feature = "mmap")] mod mmap_specific { + use std::path::PathBuf; + + use tempfile::TempDir; + use super::*; use crate::Directory; - use std::path::PathBuf; - use tempfile::TempDir; #[test] fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> { diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 796db6078..812222893 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -1,12 +1,16 @@ -use super::SegmentComponent; -use crate::schema::Schema; -use crate::Opstamp; -use crate::{core::SegmentId, store::Compressor}; -use crate::{Inventory, TrackedObject}; -use serde::{Deserialize, Serialize}; +use std::collections::HashSet; +use std::fmt; use std::path::PathBuf; -use std::{collections::HashSet, sync::atomic::AtomicBool}; -use std::{fmt, sync::Arc}; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +use super::SegmentComponent; +use crate::core::SegmentId; +use crate::schema::Schema; +use crate::store::Compressor; +use crate::{Inventory, Opstamp, TrackedObject}; #[derive(Clone, Debug, Serialize, Deserialize)] struct DeleteMeta { @@ -282,7 +286,6 @@ impl Order { /// * the searchable segments, /// * the index `docstamp` /// * the schema -/// #[derive(Clone, Serialize)] pub struct IndexMeta { /// `IndexSettings` to configure index options. @@ -370,10 +373,8 @@ impl fmt::Debug for IndexMeta { mod tests { use super::IndexMeta; - use crate::{ - schema::{Schema, TEXT}, - IndexSettings, IndexSortByField, Order, - }; + use crate::schema::{Schema, TEXT}; + use crate::{IndexSettings, IndexSortByField, Order}; #[test] fn test_serialize_metas() { diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index e710e8ff9..d1fb791f6 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,13 +1,12 @@ use std::io; +use common::BinarySerializable; + use crate::directory::FileSlice; use crate::positions::PositionReader; -use crate::postings::TermInfo; -use crate::postings::{BlockSegmentPostings, SegmentPostings}; -use crate::schema::IndexRecordOption; -use crate::schema::Term; +use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; +use crate::schema::{IndexRecordOption, Term}; use crate::termdict::TermDictionary; -use common::BinarySerializable; /// The inverted index reader is in charge of accessing /// the inverted index associated to a specific field. 
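To make the role of the inverted index reader concrete, a small sketch that walks the postings of one term, following the same pattern as the `iterating_docs_and_positions` example earlier in this diff; the helper name is hypothetical.

```rust
use tantivy::schema::{Field, IndexRecordOption, Term};
use tantivy::{DocId, DocSet, SegmentReader, TERMINATED};

/// Collect the doc ids of documents containing `text` in `field`,
/// going through the segment's inverted index reader.
fn docs_containing(
    segment_reader: &SegmentReader,
    field: Field,
    text: &str,
) -> tantivy::Result<Vec<DocId>> {
    let inverted_index = segment_reader.inverted_index(field)?;
    let term = Term::from_field_text(field, text);
    let mut docs = Vec::new();
    if let Some(mut postings) = inverted_index.read_postings(&term, IndexRecordOption::Basic)? {
        // The postings object is a `DocSet` cursor; advance until TERMINATED.
        let mut doc = postings.doc();
        while doc != TERMINATED {
            docs.push(doc);
            doc = postings.advance();
        }
    }
    Ok(docs)
}
```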
diff --git a/src/core/mod.rs b/src/core/mod.rs index dc73d2cba..6ebb65247 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,6 +8,10 @@ mod segment_component; mod segment_id; mod segment_reader; +use std::path::Path; + +use once_cell::sync::Lazy; + pub use self::executor::Executor; pub use self::index::{Index, IndexBuilder}; pub use self::index_meta::{ @@ -20,9 +24,6 @@ pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; pub use self::segment_reader::SegmentReader; -use once_cell::sync::Lazy; -use std::path::Path; - /// The meta file contains all the information about the list of segments and the schema /// of the index. pub static META_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new("meta.json")); diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 55028d8c8..37bafd35e 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -1,21 +1,14 @@ -use crate::collector::Collector; -use crate::core::Executor; -use crate::core::SegmentReader; -use crate::query::Query; -use crate::schema::Document; -use crate::schema::Schema; -use crate::schema::Term; -use crate::space_usage::SearcherSpaceUsage; -use crate::store::StoreReader; -use crate::DocAddress; -use crate::Index; -use crate::Opstamp; -use crate::SegmentId; -use crate::TrackedObject; - use std::collections::BTreeMap; use std::{fmt, io}; +use crate::collector::Collector; +use crate::core::{Executor, SegmentReader}; +use crate::query::Query; +use crate::schema::{Document, Schema, Term}; +use crate::space_usage::SearcherSpaceUsage; +use crate::store::StoreReader; +use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject}; + /// Identifies the searcher generation accessed by a [Searcher]. /// /// While this might seem redundant, a [SearcherGeneration] contains @@ -69,7 +62,6 @@ impl SearcherGeneration { /// /// It guarantees that the `Segment` will not be removed before /// the destruction of the `Searcher`. -/// pub struct Searcher { schema: Schema, index: Index, diff --git a/src/core/segment.rs b/src/core/segment.rs index 10158b196..c82295d12 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -1,15 +1,13 @@ -use super::SegmentComponent; -use crate::core::Index; -use crate::core::SegmentId; -use crate::core::SegmentMeta; -use crate::directory::error::{OpenReadError, OpenWriteError}; -use crate::directory::Directory; -use crate::directory::{FileSlice, WritePtr}; -use crate::schema::Schema; -use crate::Opstamp; use std::fmt; use std::path::PathBuf; +use super::SegmentComponent; +use crate::core::{Index, SegmentId, SegmentMeta}; +use crate::directory::error::{OpenReadError, OpenWriteError}; +use crate::directory::{Directory, FileSlice, WritePtr}; +use crate::schema::Schema; +use crate::Opstamp; + /// A segment is a piece of the index. #[derive(Clone)] pub struct Segment { diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index c46a6228d..eb5bec5f6 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -1,14 +1,14 @@ use std::cmp::{Ord, Ordering}; +use std::error::Error; use std::fmt; -use uuid::Uuid; +use std::str::FromStr; +#[cfg(test)] +use std::sync::atomic; #[cfg(test)] use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; -use std::error::Error; -use std::str::FromStr; -#[cfg(test)] -use std::sync::atomic; +use uuid::Uuid; /// Uuid identifying a segment. 
/// diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 547232058..63e99d84f 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -1,28 +1,19 @@ -use crate::core::InvertedIndexReader; -use crate::core::Segment; -use crate::core::SegmentComponent; -use crate::core::SegmentId; -use crate::directory::CompositeFile; -use crate::directory::FileSlice; +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; +use std::{fmt, io}; + +use fail::fail_point; + +use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId}; +use crate::directory::{CompositeFile, FileSlice}; use crate::error::DataCorruption; -use crate::fastfield::intersect_alive_bitsets; -use crate::fastfield::AliveBitSet; -use crate::fastfield::FacetReader; -use crate::fastfield::FastFieldReaders; +use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; -use crate::schema::FieldType; -use crate::schema::Schema; -use crate::schema::{Field, IndexRecordOption}; +use crate::schema::{Field, FieldType, IndexRecordOption, Schema}; use crate::space_usage::SegmentSpaceUsage; use crate::store::StoreReader; use crate::termdict::TermDictionary; -use crate::DocId; -use crate::Opstamp; -use fail::fail_point; -use std::fmt; -use std::sync::Arc; -use std::sync::RwLock; -use std::{collections::HashMap, io}; +use crate::{DocId, Opstamp}; /// Entry point to access all of the datastructures of the `Segment` /// @@ -130,7 +121,8 @@ impl SegmentReader { self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { let field_name = self.schema.get_field_name(field); let err_msg = format!( - "Field norm not found for field {:?}. Was the field set to record norm during indexing?", + "Field norm not found for field {:?}. Was the field set to record norm during \ + indexing?", field_name ); crate::TantivyError::SchemaError(err_msg) @@ -259,19 +251,24 @@ impl SegmentReader { let record_option = record_option_opt.unwrap(); let postings_file = postings_file_opt.unwrap(); - let termdict_file: FileSlice = self.termdict_composite.open_read(field) - .ok_or_else(|| - DataCorruption::comment_only(format!("Failed to open field {:?}'s term dictionary in the composite file. Has the schema been modified?", field_entry.name())) - )?; - - let positions_file = self - .positions_composite - .open_read(field) - .ok_or_else(|| { - let error_msg = format!("Failed to open field {:?}'s positions in the composite file. Has the schema been modified?", field_entry.name()); - DataCorruption::comment_only(error_msg) + let termdict_file: FileSlice = + self.termdict_composite.open_read(field).ok_or_else(|| { + DataCorruption::comment_only(format!( + "Failed to open field {:?}'s term dictionary in the composite file. Has the \ + schema been modified?", + field_entry.name() + )) })?; + let positions_file = self.positions_composite.open_read(field).ok_or_else(|| { + let error_msg = format!( + "Failed to open field {:?}'s positions in the composite file. 
Has the schema been \ + modified?", + field_entry.name() + ); + DataCorruption::comment_only(error_msg) + })?; + let inv_idx_reader = Arc::new(InvertedIndexReader::new( TermDictionary::open(termdict_file)?, postings_file, diff --git a/src/directory/composite_file.rs b/src/directory/composite_file.rs index 6d542609b..7743620e1 100644 --- a/src/directory/composite_file.rs +++ b/src/directory/composite_file.rs @@ -1,17 +1,14 @@ -use crate::directory::FileSlice; -use crate::directory::{TerminatingWrite, WritePtr}; -use crate::schema::Field; -use crate::space_usage::FieldUsage; -use crate::space_usage::PerFieldSpaceUsage; -use common::BinarySerializable; -use common::CountingWriter; -use common::HasLen; -use common::VInt; use std::collections::HashMap; use std::io::{self, Read, Write}; use std::iter::ExactSizeIterator; use std::ops::Range; +use common::{BinarySerializable, CountingWriter, HasLen, VInt}; + +use crate::directory::{FileSlice, TerminatingWrite, WritePtr}; +use crate::schema::Field; +use crate::space_usage::{FieldUsage, PerFieldSpaceUsage}; + #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] pub struct FileAddr { field: Field, @@ -186,13 +183,14 @@ impl CompositeFile { #[cfg(test)] mod test { + use std::io::Write; + use std::path::Path; + + use common::{BinarySerializable, VInt}; + use super::{CompositeFile, CompositeWrite}; use crate::directory::{Directory, RamDirectory}; use crate::schema::Field; - use common::BinarySerializable; - use common::VInt; - use std::io::Write; - use std::path::Path; #[test] fn test_composite_file() -> crate::Result<()> { diff --git a/src/directory/directory.rs b/src/directory/directory.rs index dc7170c8e..4cf2a7ed0 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -1,18 +1,12 @@ -use crate::directory::directory_lock::Lock; -use crate::directory::error::LockError; -use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError}; -use crate::directory::WatchHandle; -use crate::directory::{FileHandle, WatchCallback}; -use crate::directory::{FileSlice, WritePtr}; -use std::fmt; -use std::io; use std::io::Write; -use std::marker::Send; -use std::marker::Sync; -use std::path::Path; -use std::path::PathBuf; -use std::thread; +use std::marker::{Send, Sync}; +use std::path::{Path, PathBuf}; use std::time::Duration; +use std::{fmt, io, thread}; + +use crate::directory::directory_lock::Lock; +use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError}; +use crate::directory::{FileHandle, FileSlice, WatchCallback, WatchHandle, WritePtr}; /// Retry the logic of acquiring locks is pretty simple. /// We just retry `n` times after a given `duratio`, both @@ -233,8 +227,7 @@ pub trait DirectoryClone { } impl DirectoryClone for T -where - T: 'static + Directory + Clone, +where T: 'static + Directory + Clone { fn box_clone(&self) -> Box { Box::new(self.clone()) diff --git a/src/directory/directory_lock.rs b/src/directory/directory_lock.rs index 714548394..49cc2e595 100644 --- a/src/directory/directory_lock.rs +++ b/src/directory/directory_lock.rs @@ -1,6 +1,7 @@ -use once_cell::sync::Lazy; use std::path::PathBuf; +use once_cell::sync::Lazy; + /// A directory lock. /// /// A lock is associated to a specific path and some @@ -11,7 +12,6 @@ use std::path::PathBuf; /// - [META_LOCK] /// /// Check out these locks documentation for more information. -/// #[derive(Debug)] pub struct Lock { /// The lock needs to be associated with its own file `path`. 
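(Illustrative aside, not part of the patch.) The `directory_lock.rs` hunk above concerns the `Lock` type and the predefined `INDEX_WRITER_LOCK` / `META_LOCK` locks. A hedged sketch of taking such a lock through the `Directory` trait, assuming `acquire_lock` keeps its current signature; `RamDirectory` stands in for any directory implementation:

use tantivy::directory::error::LockError;
use tantivy::directory::{RamDirectory, INDEX_WRITER_LOCK};
use tantivy::Directory;

fn hold_writer_lock() -> Result<(), LockError> {
    let directory = RamDirectory::create();
    // The returned guard holds the lock; dropping it releases the lock file.
    let _guard = directory.acquire_lock(&INDEX_WRITER_LOCK)?;
    // While `_guard` is alive, another acquire_lock(&INDEX_WRITER_LOCK) on the
    // same directory is expected to fail with LockError::LockBusy.
    Ok(())
}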
diff --git a/src/directory/error.rs b/src/directory/error.rs index 6ef0b324b..4bb273ce0 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -1,15 +1,17 @@ -use crate::Version; -use std::fmt; -use std::io; use std::path::PathBuf; +use std::{fmt, io}; + +use crate::Version; /// Error while trying to acquire a directory lock. #[derive(Debug, Error)] pub enum LockError { /// Failed to acquired a lock as it is already held by another /// client. - /// - In the context of a blocking lock, this means the lock was not released within some `timeout` period. - /// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call. + /// - In the context of a blocking lock, this means the lock was not released within some + /// `timeout` period. + /// - In the context of a non-blocking lock, this means the lock was busy at the moment of the + /// call. #[error("Could not acquire lock as it is already held, possibly by a different process.")] LockBusy, /// Trying to acquire a lock failed with an `IoError` diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs index 36d032eed..9c18f1eb5 100644 --- a/src/directory/file_slice.rs +++ b/src/directory/file_slice.rs @@ -1,11 +1,11 @@ +use std::ops::{Deref, Range}; +use std::sync::{Arc, Weak}; +use std::{fmt, io}; + +use common::HasLen; use stable_deref_trait::StableDeref; use crate::directory::OwnedBytes; -use common::HasLen; -use std::fmt; -use std::ops::Range; -use std::sync::{Arc, Weak}; -use std::{io, ops::Deref}; pub type ArcBytes = Arc + Send + Sync + 'static>; pub type WeakArcBytes = Weak + Send + Sync + 'static>; @@ -33,8 +33,7 @@ impl FileHandle for &'static [u8] { } impl From for FileSlice -where - B: StableDeref + Deref + 'static + Send + Sync, +where B: StableDeref + Deref + 'static + Send + Sync { fn from(bytes: B) -> FileSlice { FileSlice::new(Box::new(OwnedBytes::new(bytes))) @@ -44,7 +43,6 @@ where /// Logical slice of read only file in tantivy. /// /// It can be cloned and sliced cheaply. 
-/// #[derive(Clone)] pub struct FileSlice { data: Arc, @@ -172,10 +170,12 @@ impl HasLen for FileSlice { #[cfg(test)] mod tests { - use super::{FileHandle, FileSlice}; - use common::HasLen; use std::io; + use common::HasLen; + + use super::{FileHandle, FileSlice}; + #[test] fn test_file_slice() -> io::Result<()> { let file_slice = FileSlice::new(Box::new(b"abcdef".as_ref())); diff --git a/src/directory/file_watcher.rs b/src/directory/file_watcher.rs index c907b41ec..ddf384a0c 100644 --- a/src/directory/file_watcher.rs +++ b/src/directory/file_watcher.rs @@ -1,13 +1,13 @@ -use crate::directory::{WatchCallback, WatchCallbackList, WatchHandle}; -use crc32fast::Hasher; -use std::fs; -use std::io; use std::io::BufRead; use std::path::Path; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use std::thread; use std::time::Duration; +use std::{fs, io, thread}; + +use crc32fast::Hasher; + +use crate::directory::{WatchCallback, WatchCallbackList, WatchHandle}; pub const POLLING_INTERVAL: Duration = Duration::from_millis(if cfg!(test) { 1 } else { 500 }); @@ -99,9 +99,8 @@ mod tests { use std::mem; - use crate::directory::mmap_directory::atomic_write; - use super::*; + use crate::directory::mmap_directory::atomic_write; #[test] fn test_file_watcher_drop_watcher() -> crate::Result<()> { diff --git a/src/directory/footer.rs b/src/directory/footer.rs index 590088791..ca673390f 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -1,14 +1,13 @@ -use crate::directory::error::Incompatibility; -use crate::directory::FileSlice; -use crate::{ - directory::{AntiCallToken, TerminatingWrite}, - Version, INDEX_FORMAT_VERSION, -}; +use std::io; +use std::io::Write; + use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen}; use crc32fast::Hasher; use serde::{Deserialize, Serialize}; -use std::io; -use std::io::Write; + +use crate::directory::error::Incompatibility; +use crate::directory::{AntiCallToken, FileSlice, TerminatingWrite}; +use crate::{Version, INDEX_FORMAT_VERSION}; const FOOTER_MAX_LEN: u32 = 50_000; @@ -64,7 +63,9 @@ impl Footer { if footer_magic_byte != FOOTER_MAGIC_NUMBER { return Err(io::Error::new( io::ErrorKind::InvalidData, - "Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which is not supported anymore. Please use tantivy 0.15 or above to recreate the index.", + "Footer magic byte mismatch. File corrupted or index was created using old an \ + tantivy version which is not supported anymore. Please use tantivy 0.15 or above \ + to recreate the index.", )); } @@ -73,7 +74,7 @@ impl Footer { io::ErrorKind::InvalidData, format!( "Footer seems invalid as it suggests a footer len of {}. 
File is corrupted, \ - or the index was created with a different & old version of tantivy.", + or the index was created with a different & old version of tantivy.", footer_len ), )); @@ -154,12 +155,13 @@ impl TerminatingWrite for FooterProxy { #[cfg(test)] mod tests { - use crate::directory::footer::Footer; - use crate::directory::OwnedBytes; - use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice}; - use common::BinarySerializable; use std::io; + use common::BinarySerializable; + + use crate::directory::footer::{Footer, FOOTER_MAGIC_NUMBER}; + use crate::directory::{FileSlice, OwnedBytes}; + #[test] fn test_deserialize_footer() { let mut buf: Vec = vec![]; @@ -183,8 +185,9 @@ mod tests { let err = Footer::extract_footer(fileslice).unwrap_err(); assert_eq!( err.to_string(), - "Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which \ - is not supported anymore. Please use tantivy 0.15 or above to recreate the index." + "Footer magic byte mismatch. File corrupted or index was created using old an tantivy \ + version which is not supported anymore. Please use tantivy 0.15 or above to recreate \ + the index." ); } #[test] @@ -219,8 +222,8 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::InvalidData); assert_eq!( err.to_string(), - "Footer seems invalid as it suggests a footer len of 50001. File is corrupted, \ - or the index was created with a different & old version of tantivy." + "Footer seems invalid as it suggests a footer len of 50001. File is corrupted, or the \ + index was created with a different & old version of tantivy." ); } } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 381d4346f..0a5080052 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -1,24 +1,21 @@ +use std::collections::HashSet; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, RwLock, RwLockWriteGuard}; +use std::{io, result}; + +use crc32fast::Hasher; + use crate::core::MANAGED_FILEPATH; use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError}; use crate::directory::footer::{Footer, FooterProxy}; -use crate::directory::GarbageCollectionResult; -use crate::directory::Lock; -use crate::directory::META_LOCK; -use crate::directory::{DirectoryLock, FileHandle}; -use crate::directory::{FileSlice, WritePtr}; -use crate::directory::{WatchCallback, WatchHandle}; +use crate::directory::{ + DirectoryLock, FileHandle, FileSlice, GarbageCollectionResult, Lock, WatchCallback, + WatchHandle, WritePtr, META_LOCK, +}; use crate::error::DataCorruption; use crate::Directory; -use crc32fast::Hasher; -use std::collections::HashSet; -use std::io; -use std::io::Write; -use std::path::{Path, PathBuf}; -use std::result; -use std::sync::RwLockWriteGuard; -use std::sync::{Arc, RwLock}; - /// Returns true iff the file is "managed". /// Non-managed file are not subject to garbage collection. 
/// @@ -344,12 +341,14 @@ impl Clone for ManagedDirectory { #[cfg(test)] mod tests_mmap_specific { - use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite}; use std::collections::HashSet; use std::io::Write; use std::path::{Path, PathBuf}; + use tempfile::TempDir; + use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite}; + #[test] fn test_managed_directory() { let tempdir = TempDir::new().unwrap(); diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index fab09d5b7..381ea1333 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -1,32 +1,28 @@ -use crate::core::META_FILEPATH; -use crate::directory::error::LockError; -use crate::directory::error::{DeleteError, OpenDirectoryError, OpenReadError, OpenWriteError}; -use crate::directory::file_watcher::FileWatcher; -use crate::directory::Directory; -use crate::directory::DirectoryLock; -use crate::directory::Lock; -use crate::directory::WatchCallback; -use crate::directory::WatchHandle; -use crate::directory::{AntiCallToken, FileHandle, OwnedBytes}; -use crate::directory::{ArcBytes, WeakArcBytes}; -use crate::directory::{TerminatingWrite, WritePtr}; +use std::collections::HashMap; +use std::convert::From; +use std::fs::{self, File, OpenOptions}; +use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write}; +use std::ops::Deref; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, RwLock}; +use std::{fmt, result}; + use fs2::FileExt; use memmap2::Mmap; use serde::{Deserialize, Serialize}; use stable_deref_trait::StableDeref; -use std::convert::From; -use std::fmt; -use std::fs::OpenOptions; -use std::fs::{self, File}; -use std::io::{self, Seek, SeekFrom}; -use std::io::{BufWriter, Read, Write}; -use std::path::{Path, PathBuf}; -use std::result; -use std::sync::Arc; -use std::sync::RwLock; -use std::{collections::HashMap, ops::Deref}; use tempfile::TempDir; +use crate::core::META_FILEPATH; +use crate::directory::error::{ + DeleteError, LockError, OpenDirectoryError, OpenReadError, OpenWriteError, +}; +use crate::directory::file_watcher::FileWatcher; +use crate::directory::{ + AntiCallToken, ArcBytes, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, + TerminatingWrite, WatchCallback, WatchHandle, WeakArcBytes, WritePtr, +}; + /// Create a default io error given a string. 
pub(crate) fn make_io_err(msg: String) -> io::Error { io::Error::new(io::ErrorKind::Other, msg) @@ -320,8 +316,7 @@ impl Directory for MmapDirectory { let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| { let msg = format!( - "Failed to acquired write lock \ - on mmap cache while reading {:?}", + "Failed to acquired write lock on mmap cache while reading {:?}", path ); let io_err = make_io_err(msg); @@ -457,6 +452,7 @@ impl Directory for MmapDirectory { #[cfg(windows)] { use std::os::windows::fs::OpenOptionsExt; + use winapi::um::winbase; open_opts @@ -476,15 +472,12 @@ mod tests { // There are more tests in directory/mod.rs // The following tests are specific to the MmapDirectory + use common::HasLen; + use super::*; use crate::indexer::LogMergePolicy; - use crate::Index; - use crate::ReloadPolicy; - use crate::{ - schema::{Schema, SchemaBuilder, TEXT}, - IndexSettings, - }; - use common::HasLen; + use crate::schema::{Schema, SchemaBuilder, TEXT}; + use crate::{Index, IndexSettings, ReloadPolicy}; #[test] fn test_open_non_existent_path() { diff --git a/src/directory/mod.rs b/src/directory/mod.rs index f93ca98cf..62ed18bc0 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -1,8 +1,4 @@ -/*! - -WORM (Write Once Read Many) directory abstraction. - -*/ +//! WORM (Write Once Read Many) directory abstraction. #[cfg(feature = "mmap")] mod mmap_directory; @@ -22,19 +18,19 @@ pub mod error; mod composite_file; +use std::io::BufWriter; +use std::path::PathBuf; + +pub use common::{AntiCallToken, TerminatingWrite}; + pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; -pub use self::directory::DirectoryLock; -pub use self::directory::{Directory, DirectoryClone}; +pub use self::directory::{Directory, DirectoryClone, DirectoryLock}; pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK}; pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes}; pub use self::file_slice::{FileHandle, FileSlice}; pub use self::owned_bytes::OwnedBytes; pub use self::ram_directory::RamDirectory; pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle}; -pub use common::AntiCallToken; -pub use common::TerminatingWrite; -use std::io::BufWriter; -use std::path::PathBuf; /// Outcome of the Garbage collection pub struct GarbageCollectionResult { @@ -50,11 +46,10 @@ pub struct GarbageCollectionResult { pub failed_to_delete_files: Vec, } +pub use self::managed_directory::ManagedDirectory; #[cfg(feature = "mmap")] pub use self::mmap_directory::MmapDirectory; -pub use self::managed_directory::ManagedDirectory; - /// Write object for Directory. 
/// /// `WritePtr` are required to implement both Write diff --git a/src/directory/owned_bytes.rs b/src/directory/owned_bytes.rs index 581ef62b2..39ba93c1a 100644 --- a/src/directory/owned_bytes.rs +++ b/src/directory/owned_bytes.rs @@ -1,9 +1,10 @@ -use crate::directory::FileHandle; use std::io; use std::ops::Range; pub use ownedbytes::OwnedBytes; +use crate::directory::FileHandle; + impl FileHandle for OwnedBytes { fn read_bytes(&self, range: Range) -> io::Result { Ok(self.slice(range)) diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 3d33fd8d0..f501b100d 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -1,19 +1,19 @@ -use crate::core::META_FILEPATH; -use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError}; -use crate::directory::AntiCallToken; -use crate::directory::WatchCallbackList; -use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle}; -use crate::directory::{TerminatingWrite, WritePtr}; -use common::HasLen; -use fail::fail_point; use std::collections::HashMap; -use std::fmt; use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; -use std::result; use std::sync::{Arc, RwLock}; +use std::{fmt, result}; + +use common::HasLen; +use fail::fail_point; use super::FileHandle; +use crate::core::META_FILEPATH; +use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError}; +use crate::directory::{ + AntiCallToken, Directory, FileSlice, TerminatingWrite, WatchCallback, WatchCallbackList, + WatchHandle, WritePtr, +}; /// Writer associated with the `RamDirectory` /// @@ -40,7 +40,9 @@ impl Drop for VecWriter { fn drop(&mut self) { if !self.is_flushed { warn!( - "You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.", + "You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This \ + also occurs when the indexer crashed, so you may want to check the logs for the \ + root cause.", self.path ) } @@ -123,7 +125,6 @@ impl fmt::Debug for RamDirectory { /// /// It is mainly meant for unit testing. /// Writes are only made visible upon flushing. 
-/// #[derive(Clone, Default)] pub struct RamDirectory { fs: Arc>, @@ -233,11 +234,12 @@ impl Directory for RamDirectory { #[cfg(test)] mod tests { - use super::RamDirectory; - use crate::Directory; use std::io::Write; use std::path::Path; + use super::RamDirectory; + use crate::Directory; + #[test] fn test_persist() { let msg_atomic: &'static [u8] = b"atomic is the way"; diff --git a/src/directory/tests.rs b/src/directory/tests.rs index 354b1538c..45b463129 100644 --- a/src/directory/tests.rs +++ b/src/directory/tests.rs @@ -1,6 +1,3 @@ -use super::*; -use futures::channel::oneshot; -use futures::executor::block_on; use std::io::Write; use std::mem; use std::path::{Path, PathBuf}; @@ -9,6 +6,11 @@ use std::sync::atomic::{AtomicBool, AtomicUsize}; use std::sync::Arc; use std::time::Duration; +use futures::channel::oneshot; +use futures::executor::block_on; + +use super::*; + #[cfg(feature = "mmap")] mod mmap_directory_tests { use crate::directory::MmapDirectory; diff --git a/src/directory/watch_event_router.rs b/src/directory/watch_event_router.rs index c42d03be3..4e828f269 100644 --- a/src/directory/watch_event_router.rs +++ b/src/directory/watch_event_router.rs @@ -1,8 +1,7 @@ +use std::sync::{Arc, RwLock, Weak}; + use futures::channel::oneshot; use futures::{Future, TryFutureExt}; -use std::sync::Arc; -use std::sync::RwLock; -use std::sync::Weak; /// Cloneable wrapper for callbacks registered when watching files of a `Directory`. #[derive(Clone)] @@ -103,12 +102,14 @@ impl WatchCallbackList { #[cfg(test)] mod tests { - use crate::directory::{WatchCallback, WatchCallbackList}; - use futures::executor::block_on; use std::mem; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; + use futures::executor::block_on; + + use crate::directory::{WatchCallback, WatchCallbackList}; + #[test] fn test_watch_event_router_simple() { let watch_event_router = WatchCallbackList::default(); diff --git a/src/docset.rs b/src/docset.rs index e5430b207..e4cb4ad69 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -1,7 +1,7 @@ +use std::borrow::{Borrow, BorrowMut}; + use crate::fastfield::AliveBitSet; use crate::DocId; -use std::borrow::Borrow; -use std::borrow::BorrowMut; /// Sentinel value returned when a DocSet has been entirely consumed. /// diff --git a/src/error.rs b/src/error.rs index 47b4f45e2..146112bd2 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,17 +1,14 @@ //! Definition of Tantivy's error and result. -use std::io; - -use crate::directory::error::{Incompatibility, LockError}; -use crate::fastfield::FastFieldNotAvailableError; -use crate::query; -use crate::{ - directory::error::{OpenDirectoryError, OpenReadError, OpenWriteError}, - schema, -}; -use std::fmt; use std::path::PathBuf; use std::sync::PoisonError; +use std::{fmt, io}; + +use crate::directory::error::{ + Incompatibility, LockError, OpenDirectoryError, OpenReadError, OpenWriteError, +}; +use crate::fastfield::FastFieldNotAvailableError; +use crate::{query, schema}; /// Represents a `DataCorruption` error. 
/// diff --git a/src/fastfield/alive_bitset.rs b/src/fastfield/alive_bitset.rs index 5c55e5317..3eca5e5bc 100644 --- a/src/fastfield/alive_bitset.rs +++ b/src/fastfield/alive_bitset.rs @@ -1,12 +1,12 @@ -use crate::space_usage::ByteCount; -use crate::DocId; -use common::intersect_bitsets; -use common::BitSet; -use common::ReadOnlyBitSet; -use ownedbytes::OwnedBytes; use std::io; use std::io::Write; +use common::{intersect_bitsets, BitSet, ReadOnlyBitSet}; +use ownedbytes::OwnedBytes; + +use crate::space_usage::ByteCount; +use crate::DocId; + /// Write a alive `BitSet` /// /// where `alive_bitset` is the set of alive `DocId`. @@ -168,11 +168,12 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::AliveBitSet; use rand::prelude::IteratorRandom; use rand::thread_rng; use test::Bencher; + use super::AliveBitSet; + fn get_alive() -> Vec { let mut data = (0..1_000_000_u32).collect::>(); for _ in 0..(1_000_000) * 1 / 8 { diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs index 97862b29c..a9bad4c7c 100644 --- a/src/fastfield/bytes/mod.rs +++ b/src/fastfield/bytes/mod.rs @@ -6,11 +6,12 @@ pub use self::writer::BytesFastFieldWriter; #[cfg(test)] mod tests { - use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value}; - use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::STORED}; - use crate::{DocAddress, DocSet, Index, Searcher, Term}; use std::ops::Deref; + use crate::query::TermQuery; + use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value, FAST, INDEXED, STORED}; + use crate::{DocAddress, DocSet, Index, Searcher, Term}; + #[test] fn test_bytes() -> crate::Result<()> { let mut schema_builder = Schema::builder(); @@ -62,7 +63,7 @@ mod tests { assert_eq!(values.len(), 2); let values_bytes: Vec<&[u8]> = values .into_iter() - .flat_map(|value| value.bytes_value()) + .flat_map(|value| value.as_bytes()) .collect(); assert_eq!(values_bytes, &[&b"tantivy"[..], &b"lucene"[..]]); Ok(()) diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs index 0e938c912..835aa2e31 100644 --- a/src/fastfield/bytes/reader.rs +++ b/src/fastfield/bytes/reader.rs @@ -1,5 +1,4 @@ -use crate::directory::FileSlice; -use crate::directory::OwnedBytes; +use crate::directory::{FileSlice, OwnedBytes}; use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength}; use crate::DocId; diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs index 3f79d44f2..91f74adf9 100644 --- a/src/fastfield/bytes/writer.rs +++ b/src/fastfield/bytes/writer.rs @@ -1,10 +1,9 @@ use std::io; +use crate::fastfield::serializer::CompositeFastFieldSerializer; +use crate::indexer::doc_id_mapping::DocIdMapping; use crate::schema::{Document, Field, Value}; use crate::DocId; -use crate::{ - fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping, -}; /// Writer for byte array (as in, any number of bytes per document) fast fields /// diff --git a/src/fastfield/error.rs b/src/fastfield/error.rs index ee00d2f15..5affdc4b9 100644 --- a/src/fastfield/error.rs +++ b/src/fastfield/error.rs @@ -1,6 +1,7 @@ -use crate::schema::FieldEntry; use std::result; +use crate::schema::FieldEntry; + /// `FastFieldNotAvailableError` is returned when the /// user requested for a fast field reader, and the field was not /// defined in the schema as a fast field. 
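(Illustrative aside, not part of the patch.) The `bytes/mod.rs` test hunk above moves from `Value::bytes_value` to `Value::as_bytes`; the later hunks apply the same `as_*` renaming to `as_facet`, `as_date`, `as_i64` and `as_text`. A small sketch of the renamed accessor, assuming a searcher and a stored bytes field set up as in that test:

use tantivy::schema::{Field, Value};
use tantivy::{DocAddress, Searcher};

fn check_first_bytes(searcher: &Searcher, bytes_field: Field) -> tantivy::Result<()> {
    let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
    // `as_bytes` returns Some(&[u8]) only if the first value is a bytes value.
    let bytes: Option<&[u8]> = doc.get_first(bytes_field).and_then(Value::as_bytes);
    assert!(bytes.is_some());
    Ok(())
}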
diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index ef354af61..2dbf66116 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -1,10 +1,10 @@ +use std::str; + use super::MultiValuedFastFieldReader; use crate::error::DataCorruption; use crate::schema::Facet; -use crate::termdict::TermDictionary; -use crate::termdict::TermOrdinal; +use crate::termdict::{TermDictionary, TermOrdinal}; use crate::DocId; -use std::str; /// The facet reader makes it possible to access the list of /// facets associated to a given document in a specific @@ -82,11 +82,8 @@ impl FacetReader { #[cfg(test)] mod tests { - use crate::Index; - use crate::{ - schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED}, - DocAddress, Document, - }; + use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED}; + use crate::{DocAddress, Document, Index}; #[test] fn test_facet_only_indexed() -> crate::Result<()> { @@ -106,7 +103,7 @@ mod tests { facet_reader.facet_ords(0u32, &mut facet_ords); assert_eq!(&facet_ords, &[2u64]); let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; - let value = doc.get_first(facet_field).and_then(Value::facet); + let value = doc.get_first(facet_field).and_then(Value::as_facet); assert_eq!(value, None); Ok(()) } @@ -129,7 +126,7 @@ mod tests { facet_reader.facet_ords(0u32, &mut facet_ords); assert_eq!(&facet_ords, &[2u64]); let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; - let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet); + let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet); assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); Ok(()) } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index c5345cf97..168259ca8 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -1,51 +1,39 @@ -/*! -Column oriented field storage for tantivy. +//! Column oriented field storage for tantivy. +//! +//! It is the equivalent of `Lucene`'s `DocValues`. +//! +//! Fast fields is a column-oriented fashion storage of `tantivy`. +//! +//! It is designed for the fast random access of some document +//! fields given a document id. +//! +//! `FastField` are useful when a field is required for all or most of +//! the `DocSet` : for instance for scoring, grouping, filtering, or faceting. +//! +//! +//! Fields have to be declared as `FAST` in the schema. +//! Currently only 64-bits integers (signed or unsigned) are +//! supported. +//! +//! They are stored in a bit-packed fashion so that their +//! memory usage is directly linear with the amplitude of the +//! values stored. +//! +//! Read access performance is comparable to that of an array lookup. -It is the equivalent of `Lucene`'s `DocValues`. - -Fast fields is a column-oriented fashion storage of `tantivy`. - -It is designed for the fast random access of some document -fields given a document id. - -`FastField` are useful when a field is required for all or most of -the `DocSet` : for instance for scoring, grouping, filtering, or faceting. - - -Fields have to be declared as `FAST` in the schema. -Currently only 64-bits integers (signed or unsigned) are -supported. - -They are stored in a bit-packed fashion so that their -memory usage is directly linear with the amplitude of the -values stored. - -Read access performance is comparable to that of an array lookup. 
-*/ - -pub use self::alive_bitset::intersect_alive_bitsets; -pub use self::alive_bitset::write_alive_bitset; -pub use self::alive_bitset::AliveBitSet; +pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet}; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; pub(crate) use self::reader::BitpackedFastFieldReader; -pub use self::reader::DynamicFastFieldReader; -pub use self::reader::FastFieldReader; +pub use self::reader::{DynamicFastFieldReader, FastFieldReader}; pub use self::readers::FastFieldReaders; -pub use self::serializer::CompositeFastFieldSerializer; -pub use self::serializer::FastFieldDataAccess; -pub use self::serializer::FastFieldStats; +pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats}; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; -use crate::schema::Cardinality; -use crate::schema::FieldType; -use crate::schema::Value; +use crate::chrono::{NaiveDateTime, Utc}; +use crate::schema::{Cardinality, FieldType, Type, Value}; use crate::DocId; -use crate::{ - chrono::{NaiveDateTime, Utc}, - schema::Type, -}; mod alive_bitset; mod bytes; @@ -213,22 +201,20 @@ fn value_to_u64(value: &Value) -> u64 { #[cfg(test)] mod tests { - use super::*; - use crate::directory::CompositeFile; - use crate::directory::{Directory, RamDirectory, WritePtr}; - use crate::merge_policy::NoMergePolicy; - use crate::schema::Field; - use crate::schema::Schema; - use crate::schema::FAST; - use crate::schema::{Document, IntOptions}; - use crate::{Index, SegmentId, SegmentReader}; + use std::collections::HashMap; + use std::path::Path; + use common::HasLen; use once_cell::sync::Lazy; use rand::prelude::SliceRandom; use rand::rngs::StdRng; use rand::SeedableRng; - use std::collections::HashMap; - use std::path::Path; + + use super::*; + use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; + use crate::merge_policy::NoMergePolicy; + use crate::schema::{Document, Field, IntOptions, Schema, FAST}; + use crate::{Index, SegmentId, SegmentReader}; pub static SCHEMA: Lazy = Lazy::new(|| { let mut schema_builder = Schema::builder(); @@ -407,7 +393,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - //assert_eq!(file.len(), 17710 as usize); //bitpacked size + // assert_eq!(file.len(), 17710 as usize); //bitpacked size assert_eq!(file.len(), 10175_usize); // linear interpol size { let fast_fields_composite = CompositeFile::open(&file)?; @@ -587,16 +573,16 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::tests::FIELD; - use super::tests::{generate_permutation, SCHEMA}; - use super::*; - use crate::directory::CompositeFile; - use crate::directory::{Directory, RamDirectory, WritePtr}; - use crate::fastfield::FastFieldReader; use std::collections::HashMap; use std::path::Path; + use test::{self, Bencher}; + use super::tests::{generate_permutation, FIELD, SCHEMA}; + use super::*; + use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; + use crate::fastfield::FastFieldReader; + #[bench] fn bench_intfastfield_linear_veclookup(b: &mut Bencher) { let permutation = generate_permutation(); diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index b4c54e58b..777521874 100644 --- 
a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -7,23 +7,17 @@ pub use self::writer::MultiValuedFastFieldWriter; #[cfg(test)] mod tests { + use chrono::Duration; + use futures::executor::block_on; + use proptest::strategy::Strategy; + use proptest::{prop_oneof, proptest}; + use test_log::test; + use crate::collector::TopDocs; use crate::indexer::NoMergePolicy; use crate::query::QueryParser; - use crate::schema::Cardinality; - use crate::schema::Facet; - use crate::schema::FacetOptions; - use crate::schema::IntOptions; - use crate::schema::Schema; - use crate::Document; - use crate::Index; - use crate::Term; - use chrono::Duration; - use futures::executor::block_on; - use proptest::prop_oneof; - use proptest::proptest; - use proptest::strategy::Strategy; - use test_log::test; + use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema}; + use crate::{Document, Index, Term}; #[test] fn test_multivalued_u64() -> crate::Result<()> { @@ -110,7 +104,7 @@ mod tests { retrieved_doc .get_first(date_field) .expect("cannot find value") - .date_value() + .as_date() .unwrap() .timestamp(), first_time_stamp.timestamp() @@ -119,7 +113,7 @@ mod tests { retrieved_doc .get_first(time_i) .expect("cannot find value") - .i64_value(), + .as_i64(), Some(1i64) ); } @@ -138,7 +132,7 @@ mod tests { retrieved_doc .get_first(date_field) .expect("cannot find value") - .date_value() + .as_date() .unwrap() .timestamp(), two_secs_ahead.timestamp() @@ -147,7 +141,7 @@ mod tests { retrieved_doc .get_first(time_i) .expect("cannot find value") - .i64_value(), + .as_i64(), Some(3i64) ); } @@ -180,7 +174,7 @@ mod tests { retrieved_doc .get_first(date_field) .expect("cannot find value") - .date_value() + .as_date() .expect("value not of Date type") .timestamp(), (first_time_stamp + Duration::seconds(offset_sec)).timestamp() @@ -189,7 +183,7 @@ mod tests { retrieved_doc .get_first(time_i) .expect("cannot find value") - .i64_value(), + .as_i64(), Some(time_i_val) ); } diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 3278d94c4..29131e73a 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -10,7 +10,6 @@ use crate::DocId; /// The `vals_reader` will access the concatenated list of all /// values for all reader. /// The `idx_reader` associated, for each document, the index of its first value. -/// #[derive(Clone)] pub struct MultiValuedFastFieldReader { idx_reader: DynamicFastFieldReader, diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index faeffddc7..6114cedc3 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -1,13 +1,15 @@ +use std::io; + +use fnv::FnvHashMap; +use tantivy_bitpacker::minmax; + use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy; -use crate::fastfield::CompositeFastFieldSerializer; +use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer}; +use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId; use crate::schema::{Document, Field}; use crate::termdict::TermOrdinal; use crate::DocId; -use crate::{fastfield::value_to_u64, indexer::doc_id_mapping::DocIdMapping}; -use fnv::FnvHashMap; -use std::io; -use tantivy_bitpacker::minmax; /// Writer for multi-valued (as in, more than one value per document) /// int fast field. @@ -20,7 +22,8 @@ use tantivy_bitpacker::minmax; /// - add your document simply by calling `.add_document(...)`. 
/// /// The `MultiValuedFastFieldWriter` can be acquired from the -/// fastfield writer, by calling [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer). +/// fastfield writer, by calling +/// [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer). /// /// Once acquired, writing is done by calling calls to /// `.add_document_vals(&[u64])` once per document. @@ -131,7 +134,6 @@ impl MultiValuedFastFieldWriter { /// During the serialization of the segment, terms gets sorted and /// `tantivy` builds a mapping to convert this `UnorderedTermId` into /// term ordinals. - /// pub fn serialize( &self, serializer: &mut CompositeFastFieldSerializer, diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 0fbefb7d6..feb5c91b7 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,25 +1,25 @@ -use super::FastValue; -use crate::directory::CompositeFile; -use crate::directory::FileSlice; -use crate::directory::OwnedBytes; -use crate::directory::{Directory, RamDirectory, WritePtr}; -use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter}; -use crate::schema::Schema; -use crate::schema::FAST; -use crate::DocId; -use common::BinarySerializable; -use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader; -use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; -use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader; -use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; -use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader; -use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; -use fastfield_codecs::FastFieldCodecReader; -use fastfield_codecs::FastFieldCodecSerializer; use std::collections::HashMap; use std::marker::PhantomData; use std::path::Path; +use common::BinarySerializable; +use fastfield_codecs::bitpacked::{ + BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer, +}; +use fastfield_codecs::linearinterpol::{ + LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer, +}; +use fastfield_codecs::multilinearinterpol::{ + MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, +}; +use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer}; + +use super::FastValue; +use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr}; +use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter}; +use crate::schema::{Schema, FAST}; +use crate::DocId; + /// FastFieldReader is the trait to access fast field data. pub trait FastFieldReader: Clone { /// Return the value associated to the given document. @@ -64,7 +64,6 @@ pub trait FastFieldReader: Clone { #[derive(Clone)] /// DynamicFastFieldReader wraps different readers to access /// the various encoded fastfield data -/// pub enum DynamicFastFieldReader { /// Bitpacked compressed fastfield data. Bitpacked(FastFieldReaderCodecWrapper), @@ -146,7 +145,6 @@ impl FastFieldReader for DynamicFastFieldReader { /// Wrapper for accessing a fastfield. /// /// Holds the data and the codec to the read the data. 
-/// #[derive(Clone)] pub struct FastFieldReaderCodecWrapper { reader: CodecReader, @@ -162,7 +160,8 @@ impl FastFieldReaderCodecWrapper crate::Result> { self.typed_fast_field_reader(field) } @@ -171,8 +171,8 @@ impl FastFieldReaders { self.typed_fast_field_multi_reader(field) } - /// Returns a `u64s` multi-valued fast field reader reader associated to `field`, regardless of whether the given - /// field is effectively of type `u64` or not. + /// Returns a `u64s` multi-valued fast field reader reader associated to `field`, regardless of + /// whether the given field is effectively of type `u64` or not. /// /// If `field` is not a u64 multi-valued fast field, this method returns an Error. pub fn u64s_lenient(&self, field: Field) -> crate::Result> { diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index e9009ec49..b138689d9 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -1,16 +1,15 @@ -use crate::directory::CompositeWrite; -use crate::directory::WritePtr; -use crate::schema::Field; -use common::BinarySerializable; -use common::CountingWriter; -pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; -pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy; +use std::io::{self, Write}; + +use common::{BinarySerializable, CountingWriter}; +pub use fastfield_codecs::bitpacked::{ + BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy, +}; use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; -pub use fastfield_codecs::FastFieldCodecSerializer; -pub use fastfield_codecs::FastFieldDataAccess; -pub use fastfield_codecs::FastFieldStats; -use std::io::{self, Write}; +pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; + +use crate::directory::{CompositeWrite, WritePtr}; +use crate::schema::Field; /// `CompositeFastFieldSerializer` is in charge of serializing /// fastfields on disk. @@ -58,7 +57,8 @@ impl CompositeFastFieldSerializer { Ok(CompositeFastFieldSerializer { composite_write }) } - /// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically. + /// Serialize data into a new u64 fast field. The best compression codec will be chosen + /// automatically. pub fn create_auto_detect_u64_fast_field( &mut self, field: Field, @@ -76,7 +76,8 @@ impl CompositeFastFieldSerializer { 0, ) } - /// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically. + /// Serialize data into a new u64 fast field. The best compression codec will be chosen + /// automatically. 
pub fn create_auto_detect_u64_fast_field_with_idx( &mut self, field: Field, @@ -112,7 +113,8 @@ impl CompositeFastFieldSerializer { broken_estimation.1 ); } - // removing nan values for codecs with broken calculations, and max values which disables codecs + // removing nan values for codecs with broken calculations, and max values which disables + // codecs estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX); estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); let (_ratio, name, id) = estimations[0]; diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 388ee19a9..34bc9099d 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,3 +1,10 @@ +use std::collections::HashMap; +use std::io; + +use common; +use fnv::FnvHashMap; +use tantivy_bitpacker::BlockedBitpacker; + use super::multivalued::MultiValuedFastFieldWriter; use super::serializer::FastFieldStats; use super::FastFieldDataAccess; @@ -6,11 +13,6 @@ use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId; use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema}; use crate::termdict::TermOrdinal; -use common; -use fnv::FnvHashMap; -use std::collections::HashMap; -use std::io; -use tantivy_bitpacker::BlockedBitpacker; /// The fastfieldswriter regroup all of the fast field writers. pub struct FastFieldsWriter { @@ -324,7 +326,8 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> { impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> { /// Return the value associated to the given doc. /// - /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons. + /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance + /// reasons. 
/// /// # Panics /// @@ -332,7 +335,9 @@ impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'b fn get_val(&self, doc: u64) -> u64 { if let Some(doc_id_map) = self.doc_id_map { self.vals - .get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map + .get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra + // FastFieldReader wrapper for + // non doc_id_map } else { self.vals.get(doc as usize) } diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index bbb85197f..9eb64fb71 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -21,32 +21,24 @@ mod reader; mod serializer; mod writer; +use self::code::{fieldnorm_to_id, id_to_fieldnorm}; pub use self::reader::{FieldNormReader, FieldNormReaders}; pub use self::serializer::FieldNormsSerializer; pub use self::writer::FieldNormsWriter; -use self::code::{fieldnorm_to_id, id_to_fieldnorm}; - #[cfg(test)] mod tests { - use crate::directory::CompositeFile; - use crate::directory::{Directory, RamDirectory, WritePtr}; - use crate::fieldnorm::FieldNormReader; - use crate::fieldnorm::FieldNormsSerializer; - use crate::fieldnorm::FieldNormsWriter; - use crate::query::Query; - use crate::query::TermQuery; - use crate::schema::IndexRecordOption; - use crate::schema::TextFieldIndexing; - use crate::schema::TextOptions; - use crate::schema::TEXT; - use crate::Index; - use crate::Term; - use crate::TERMINATED; - use once_cell::sync::Lazy; use std::path::Path; - use crate::schema::{Field, Schema, STORED}; + use once_cell::sync::Lazy; + + use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; + use crate::fieldnorm::{FieldNormReader, FieldNormsSerializer, FieldNormsWriter}; + use crate::query::{Query, TermQuery}; + use crate::schema::{ + Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, TEXT, + }; + use crate::{Index, Term, TERMINATED}; pub static SCHEMA: Lazy = Lazy::new(|| { let mut schema_builder = Schema::builder(); diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index ea264b2ff..397d00f19 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -1,11 +1,10 @@ +use std::sync::Arc; + use super::{fieldnorm_to_id, id_to_fieldnorm}; -use crate::directory::CompositeFile; -use crate::directory::FileSlice; -use crate::directory::OwnedBytes; +use crate::directory::{CompositeFile, FileSlice, OwnedBytes}; use crate::schema::Field; use crate::space_usage::PerFieldSpaceUsage; use crate::DocId; -use std::sync::Arc; /// Reader for the fieldnorm (for each document, the number of tokens indexed in the /// field) of all indexed fields in the index. diff --git a/src/fieldnorm/serializer.rs b/src/fieldnorm/serializer.rs index 54043b1e9..316b4cfad 100644 --- a/src/fieldnorm/serializer.rs +++ b/src/fieldnorm/serializer.rs @@ -1,9 +1,9 @@ -use crate::directory::CompositeWrite; -use crate::directory::WritePtr; -use crate::schema::Field; use std::io; use std::io::Write; +use crate::directory::{CompositeWrite, WritePtr}; +use crate::schema::Field; + /// The fieldnorms serializer is in charge of /// the serialization of field norms for all fields. 
pub struct FieldNormsSerializer { diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index 0a293a798..4e14019a7 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -1,12 +1,11 @@ -use crate::{indexer::doc_id_mapping::DocIdMapping, DocId}; - -use super::fieldnorm_to_id; -use super::FieldNormsSerializer; -use crate::schema::Field; -use crate::schema::Schema; use std::cmp::Ordering; use std::{io, iter}; +use super::{fieldnorm_to_id, FieldNormsSerializer}; +use crate::indexer::doc_id_mapping::DocIdMapping; +use crate::schema::{Field, Schema}; +use crate::DocId; + /// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte /// of each document for each field with field norms. /// diff --git a/src/functional_test.rs b/src/functional_test.rs index 4128e5f2f..75c94a568 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -1,14 +1,10 @@ -use crate::schema; -use crate::Index; -use crate::IndexSettings; -use crate::IndexSortByField; -use crate::Order; -use crate::Searcher; -use crate::{doc, schema::*}; -use rand::thread_rng; -use rand::Rng; use std::collections::HashSet; +use rand::{thread_rng, Rng}; + +use crate::schema::*; +use crate::{doc, schema, Index, IndexSettings, IndexSortByField, Order, Searcher}; + fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> { assert!(searcher.segment_readers().len() < 20); assert_eq!(searcher.num_docs() as usize, vals.len()); @@ -130,14 +126,12 @@ fn test_functional_indexing_sorted() -> crate::Result<()> { Ok(()) } -const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ - do eiusmod tempor incididunt ut labore et dolore magna aliqua. \ - Ut enim ad minim veniam, quis nostrud exercitation ullamco \ - laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \ - dolor in reprehenderit in voluptate velit esse cillum dolore eu \ - fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \ - proident, sunt in culpa qui officia deserunt mollit anim id est \ - laborum."; +const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \ + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \ + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo \ + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse \ + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat \ + non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."; fn get_text() -> String { use rand::seq::SliceRandom; let mut rng = thread_rng(); diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 285fc86c2..fb39bd089 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -1,9 +1,9 @@ -use super::operation::DeleteOperation; -use crate::Opstamp; - use std::ops::DerefMut; use std::sync::{Arc, RwLock, Weak}; +use super::operation::DeleteOperation; +use crate::Opstamp; + // The DeleteQueue is similar in conceptually to a multiple // consumer single producer broadcast channel. // @@ -13,12 +13,10 @@ use std::sync::{Arc, RwLock, Weak}; // which points to a specific place of the `DeleteQueue`. // // New consumer can be created in two ways -// - calling `delete_queue.cursor()` returns a cursor, that -// will include all future delete operation (and some or none -// of the past operations... The client is in charge of checking the opstamps.). 
-// - cloning an existing cursor returns a new cursor, that -// is at the exact same position, and can now advance independently -// from the original cursor. +// - calling `delete_queue.cursor()` returns a cursor, that will include all future delete operation +// (and some or none of the past operations... The client is in charge of checking the opstamps.). +// - cloning an existing cursor returns a new cursor, that is at the exact same position, and can +// now advance independently from the original cursor. #[derive(Default)] struct InnerDeleteQueue { writer: Vec, @@ -179,8 +177,8 @@ pub struct DeleteCursor { impl DeleteCursor { /// Skips operations and position it so that - /// - either all of the delete operation currently in the - /// queue are consume and the next get will return None. + /// - either all of the delete operation currently in the queue are consume and the next get + /// will return None. /// - the next get will return the first operation with an /// `opstamp >= target_opstamp`. pub fn skip_to(&mut self, target_opstamp: Opstamp) { diff --git a/src/indexer/demuxer.rs b/src/indexer/demuxer.rs index 47bb8a47f..54409eade 100644 --- a/src/indexer/demuxer.rs +++ b/src/indexer/demuxer.rs @@ -5,8 +5,8 @@ use crate::fastfield::AliveBitSet; use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, SegmentOrdinal}; /// DemuxMapping can be used to reorganize data from multiple segments. /// -/// DemuxMapping is useful in a multitenant settings, in which each document might actually belong to a different tenant. -/// It allows to reorganize documents as follows: +/// DemuxMapping is useful in a multitenant settings, in which each document might actually belong +/// to a different tenant. It allows to reorganize documents as follows: /// /// e.g. if you have two tenant ids TENANT_A and TENANT_B and two segments with /// the documents (simplified) @@ -18,7 +18,8 @@ use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, S /// Seg 2 [TENANT_B, TENANT_B] /// /// Demuxing is the tool for that. -/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment ordinal]. +/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment +/// ordinal]. 
#[derive(Debug, Default)] pub struct DemuxMapping { /// [index old segment ordinal] -> [index doc_id] = new segment ordinal @@ -132,27 +133,24 @@ pub fn demux( #[cfg(test)] mod tests { - use crate::{ - collector::TopDocs, - directory::RamDirectory, - query::QueryParser, - schema::{Schema, TEXT}, - DocAddress, Term, - }; - use super::*; + use crate::collector::TopDocs; + use crate::directory::RamDirectory; + use crate::query::QueryParser; + use crate::schema::{Schema, TEXT}; + use crate::{DocAddress, Term}; #[test] fn test_demux_map_to_deletebitset() { let max_value = 2; let mut demux_mapping = DemuxMapping::default(); - //segment ordinal 0 mapping + // segment ordinal 0 mapping let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value); doc_id_to_segment.set(0, 1); doc_id_to_segment.set(1, 0); demux_mapping.add(doc_id_to_segment); - //segment ordinal 1 mapping + // segment ordinal 1 mapping let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value); doc_id_to_segment.set(0, 1); doc_id_to_segment.set(1, 1); @@ -235,13 +233,13 @@ mod tests { let mut demux_mapping = DemuxMapping::default(); { let max_value = 2; - //segment ordinal 0 mapping + // segment ordinal 0 mapping let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value); doc_id_to_segment.set(0, 1); doc_id_to_segment.set(1, 0); demux_mapping.add(doc_id_to_segment); - //segment ordinal 1 mapping + // segment ordinal 1 mapping let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value); doc_id_to_segment.set(0, 1); doc_id_to_segment.set(1, 1); diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs index e55f3f3ed..12ebb5361 100644 --- a/src/indexer/doc_id_mapping.rs +++ b/src/indexer/doc_id_mapping.rs @@ -1,13 +1,12 @@ //! This module is used when sorting the index by a property, e.g. //! to get mappings from old doc_id to new doc_id and vice versa, after sorting -//! + +use std::cmp::Reverse; +use std::ops::Index; use super::SegmentWriter; -use crate::{ - schema::{Field, Schema}, - DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError, -}; -use std::{cmp::Reverse, ops::Index}; +use crate::schema::{Field, Schema}; +use crate::{DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError}; /// Struct to provide mapping from new doc_id to old doc_id and segment. #[derive(Clone)] @@ -152,11 +151,12 @@ pub(crate) fn get_doc_id_mapping_from_field( #[cfg(test)] mod tests_indexsorting { + use crate::collector::TopDocs; use crate::fastfield::FastFieldReader; use crate::indexer::doc_id_mapping::DocIdMapping; - use crate::{collector::TopDocs, query::QueryParser, schema::*}; - use crate::{schema::Schema, DocAddress}; - use crate::{Index, IndexSettings, IndexSortByField, Order}; + use crate::query::QueryParser; + use crate::schema::{Schema, *}; + use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order}; fn create_test_index( index_settings: Option, @@ -217,7 +217,7 @@ mod tests_indexsorting { ]; for option in options { - //let options = get_text_options(); + // let options = get_text_options(); // no index_sort let index = create_test_index(None, option.clone())?; let my_text_field = index.schema().get_field("text_field").unwrap(); @@ -318,7 +318,7 @@ mod tests_indexsorting { .doc(DocAddress::new(0, 3))? .get_first(my_string_field) .unwrap() - .text(), + .as_text(), Some("blublub") ); } @@ -341,7 +341,7 @@ mod tests_indexsorting { .doc(DocAddress::new(0, 0))? 
.get_first(my_string_field) .unwrap() - .text(), + .as_text(), Some("blublub") ); let doc = searcher.doc(DocAddress::new(0, 4))?; @@ -363,7 +363,7 @@ mod tests_indexsorting { { let doc = searcher.doc(DocAddress::new(0, 4))?; assert_eq!( - doc.get_first(my_string_field).unwrap().text(), + doc.get_first(my_string_field).unwrap().as_text(), Some("blublub") ); } diff --git a/src/indexer/doc_opstamp_mapping.rs b/src/indexer/doc_opstamp_mapping.rs index d5a905cab..b209024be 100644 --- a/src/indexer/doc_opstamp_mapping.rs +++ b/src/indexer/doc_opstamp_mapping.rs @@ -1,5 +1,4 @@ -use crate::DocId; -use crate::Opstamp; +use crate::{DocId, Opstamp}; // Doc to opstamp is used to identify which // document should be deleted. diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 74b70b27a..8d6c0c40e 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1,14 +1,19 @@ +use std::ops::Range; +use std::sync::Arc; +use std::thread; +use std::thread::JoinHandle; + +use common::BitSet; +use crossbeam::channel; +use futures::executor::block_on; +use futures::future::Future; +use smallvec::smallvec; + use super::operation::{AddOperation, UserOperation}; use super::segment_updater::SegmentUpdater; -use super::PreparedCommit; -use crate::core::Index; -use crate::core::Segment; -use crate::core::SegmentComponent; -use crate::core::SegmentId; -use crate::core::SegmentMeta; -use crate::core::SegmentReader; -use crate::directory::TerminatingWrite; -use crate::directory::{DirectoryLock, GarbageCollectionResult}; +use super::{AddBatch, AddBatchReceiver, AddBatchSender, PreparedCommit}; +use crate::core::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader}; +use crate::directory::{DirectoryLock, GarbageCollectionResult, TerminatingWrite}; use crate::docset::{DocSet, TERMINATED}; use crate::error::TantivyError; use crate::fastfield::write_alive_bitset; @@ -17,24 +22,9 @@ use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping; use crate::indexer::index_writer_status::IndexWriterStatus; use crate::indexer::operation::DeleteOperation; use crate::indexer::stamper::Stamper; -use crate::indexer::MergePolicy; -use crate::indexer::SegmentEntry; -use crate::indexer::SegmentWriter; -use crate::schema::Document; -use crate::schema::IndexRecordOption; -use crate::schema::Term; +use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter}; +use crate::schema::{Document, IndexRecordOption, Term}; use crate::Opstamp; -use common::BitSet; -use crossbeam::channel; -use futures::executor::block_on; -use futures::future::Future; -use smallvec::smallvec; -use std::ops::Range; -use std::sync::Arc; -use std::thread; -use std::thread::JoinHandle; - -use super::{AddBatch, AddBatchReceiver, AddBatchSender}; // Size of the margin for the heap. A segment is closed when the remaining memory // in the heap goes below MARGIN_IN_BYTES. @@ -392,7 +382,13 @@ impl IndexWriter { fn operation_receiver(&self) -> crate::Result { self.index_writer_status .operation_receiver() - .ok_or_else(|| crate::TantivyError::ErrorInThread("The index writer was killed. It can happen if an indexing worker encounterred an Io error for instance.".to_string())) + .ok_or_else(|| { + crate::TantivyError::ErrorInThread( + "The index writer was killed. It can happen if an indexing worker \ + encounterred an Io error for instance." + .to_string(), + ) + }) } /// Spawns a new worker thread for indexing. 
@@ -653,7 +649,6 @@ impl IndexWriter { /// /// Commit returns the `opstamp` of the last document /// that made it in the commit. - /// pub fn commit(&mut self) -> crate::Result { self.prepare_commit()?.commit() } @@ -780,8 +775,7 @@ impl Drop for IndexWriter { #[cfg(test)] mod tests { - use std::collections::HashMap; - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; use futures::executor::block_on; use proptest::prelude::*; @@ -794,31 +788,20 @@ mod tests { use crate::error::*; use crate::fastfield::FastFieldReader; use crate::indexer::NoMergePolicy; - use crate::query::QueryParser; - use crate::query::TermQuery; - use crate::schema::Cardinality; - use crate::schema::Facet; - use crate::schema::FacetOptions; - use crate::schema::IntOptions; - use crate::schema::TextFieldIndexing; - use crate::schema::TextOptions; - use crate::schema::STORED; - use crate::schema::TEXT; - use crate::schema::{self, IndexRecordOption, FAST, INDEXED, STRING}; - use crate::DocAddress; - use crate::Index; - use crate::ReloadPolicy; - use crate::Term; - use crate::{IndexSettings, IndexSortByField, Order}; + use crate::query::{QueryParser, TermQuery}; + use crate::schema::{ + self, Cardinality, Facet, FacetOptions, IndexRecordOption, IntOptions, TextFieldIndexing, + TextOptions, FAST, INDEXED, STORED, STRING, TEXT, + }; + use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term}; - const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ - do eiusmod tempor incididunt ut labore et dolore magna aliqua. \ - Ut enim ad minim veniam, quis nostrud exercitation ullamco \ - laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \ - dolor in reprehenderit in voluptate velit esse cillum dolore eu \ - fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \ - proident, sunt in culpa qui officia deserunt mollit anim id est \ - laborum."; + const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \ + eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad \ + minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip \ + ex ea commodo consequat. Duis aute irure dolor in reprehenderit in \ + voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur \ + sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt \ + mollit anim id est laborum."; #[test] fn test_operations_group() { @@ -973,8 +956,8 @@ mod tests { let index_writer = index.writer(3_000_000).unwrap(); assert_eq!( format!("{:?}", index_writer.get_merge_policy()), - "LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \ - level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }" + "LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, \ + min_layer_size: 10000, level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }" ); let merge_policy = Box::new(NoMergePolicy::default()); index_writer.set_merge_policy(merge_policy); @@ -1547,12 +1530,7 @@ mod tests { let store_reader = segment_reader.get_store_reader().unwrap(); // test store iterator for doc in store_reader.iter(segment_reader.alive_bitset()) { - let id = doc - .unwrap() - .get_first(id_field) - .unwrap() - .u64_value() - .unwrap(); + let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap(); assert!(expected_ids_and_num_occurences.contains_key(&id)); } // test store random access @@ -1562,7 +1540,7 @@ mod tests { .unwrap() .get_first(id_field) .unwrap() - .u64_value() + .as_u64() .unwrap(); assert!(expected_ids_and_num_occurences.contains_key(&id)); let id2 = store_reader @@ -1570,7 +1548,7 @@ mod tests { .unwrap() .get_first(multi_numbers) .unwrap() - .u64_value() + .as_u64() .unwrap(); assert_eq!(id, id2); } diff --git a/src/indexer/index_writer_status.rs b/src/indexer/index_writer_status.rs index 6b1ee2680..428b703d2 100644 --- a/src/indexer/index_writer_status.rs +++ b/src/indexer/index_writer_status.rs @@ -90,10 +90,12 @@ impl Drop for IndexWriterBomb { #[cfg(test)] mod tests { - use super::IndexWriterStatus; - use crossbeam::channel; use std::mem; + use crossbeam::channel; + + use super::IndexWriterStatus; + #[test] fn test_bomb_goes_boom() { let (_tx, rx) = channel::bounded(10); diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 913834a55..98a840a51 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -1,7 +1,9 @@ +use std::cmp; + +use itertools::Itertools; + use super::merge_policy::{MergeCandidate, MergePolicy}; use crate::core::SegmentMeta; -use itertools::Itertools; -use std::cmp; const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75; const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000; @@ -139,14 +141,14 @@ impl Default for LogMergePolicy { #[cfg(test)] mod tests { - use super::*; - use crate::{ - core::{SegmentId, SegmentMeta, SegmentMetaInventory}, - schema, - }; - use crate::{indexer::merge_policy::MergePolicy, schema::INDEXED}; use once_cell::sync::Lazy; + use super::*; + use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory}; + use crate::indexer::merge_policy::MergePolicy; + use crate::schema; + use crate::schema::INDEXED; + static INVENTORY: Lazy = Lazy::new(SegmentMetaInventory::default); use crate::Index; diff --git a/src/indexer/merge_operation.rs b/src/indexer/merge_operation.rs index 28a0e53ee..6d547ff3e 100644 --- a/src/indexer/merge_operation.rs +++ b/src/indexer/merge_operation.rs @@ -1,9 +1,8 @@ -use crate::Opstamp; -use crate::SegmentId; -use crate::{Inventory, TrackedObject}; use std::collections::HashSet; use std::ops::Deref; +use crate::{Inventory, Opstamp, SegmentId, TrackedObject}; + #[derive(Default)] pub(crate) struct MergeOperationInventory(Inventory); diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs index 
93499e071..4b356518a 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -1,8 +1,8 @@ -use crate::core::SegmentId; -use crate::core::SegmentMeta; use std::fmt::Debug; use std::marker; +use crate::core::{SegmentId, SegmentMeta}; + /// Set of segment suggested for a merge. #[derive(Debug, Clone)] pub struct MergeCandidate(pub Vec); @@ -39,8 +39,7 @@ impl MergePolicy for NoMergePolicy { pub mod tests { use super::*; - use crate::core::SegmentId; - use crate::core::SegmentMeta; + use crate::core::{SegmentId, SegmentMeta}; /// `MergePolicy` useful for test purposes. /// diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 047dd0df1..34b7b7daa 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,41 +1,30 @@ -use crate::error::DataCorruption; -use crate::fastfield::AliveBitSet; -use crate::fastfield::CompositeFastFieldSerializer; -use crate::fastfield::DynamicFastFieldReader; -use crate::fastfield::FastFieldDataAccess; -use crate::fastfield::FastFieldReader; -use crate::fastfield::FastFieldStats; -use crate::fastfield::MultiValueLength; -use crate::fastfield::MultiValuedFastFieldReader; -use crate::fieldnorm::FieldNormsSerializer; -use crate::fieldnorm::FieldNormsWriter; -use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; -use crate::indexer::doc_id_mapping::SegmentDocIdMapping; -use crate::indexer::SegmentSerializer; -use crate::postings::Postings; -use crate::postings::{InvertedIndexSerializer, SegmentPostings}; -use crate::schema::Cardinality; -use crate::schema::FieldType; -use crate::schema::{Field, Schema}; -use crate::store::StoreWriter; -use crate::termdict::TermMerger; -use crate::termdict::TermOrdinal; -use crate::IndexSettings; -use crate::IndexSortByField; -use crate::{core::Segment, indexer::doc_id_mapping::expect_field_id_for_sort_field}; -use crate::{core::SegmentReader, Order}; -use crate::{ - docset::{DocSet, TERMINATED}, - SegmentOrdinal, -}; -use crate::{DocId, InvertedIndexReader, SegmentComponent}; -use itertools::Itertools; -use measure_time::debug_time; use std::cmp; use std::collections::HashMap; use std::sync::Arc; + +use itertools::Itertools; +use measure_time::debug_time; use tantivy_bitpacker::minmax; +use crate::core::{Segment, SegmentReader}; +use crate::docset::{DocSet, TERMINATED}; +use crate::error::DataCorruption; +use crate::fastfield::{ + AliveBitSet, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldDataAccess, + FastFieldReader, FastFieldStats, MultiValueLength, MultiValuedFastFieldReader, +}; +use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; +use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; +use crate::indexer::SegmentSerializer; +use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings}; +use crate::schema::{Cardinality, Field, FieldType, Schema}; +use crate::store::StoreWriter; +use crate::termdict::{TermMerger, TermOrdinal}; +use crate::{ + DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentComponent, + SegmentOrdinal, +}; + /// Segment's max doc must be `< MAX_DOC_LIMIT`. /// /// We do not allow segments with more than @@ -46,8 +35,8 @@ fn estimate_total_num_tokens_in_single_segment( field: Field, ) -> crate::Result { // There are no deletes. We can simply use the exact value saved into the posting list. - // Note that this value is not necessarily exact as it could have been the result of a merge between - // segments themselves containing deletes. 
+ // Note that this value is not necessarily exact as it could have been the result of a merge + // between segments themselves containing deletes. if !reader.has_deletes() { return Ok(reader.inverted_index(field)?.total_num_tokens()); } @@ -218,8 +207,8 @@ impl IndexMerger { // sort segments by their natural sort setting if max_doc >= MAX_DOC_LIMIT { let err_msg = format!( - "The segment resulting from this merge would have {} docs,\ - which exceeds the limit {}.", + "The segment resulting from this merge would have {} docs,which exceeds the limit \ + {}.", max_doc, MAX_DOC_LIMIT ); return Err(crate::TantivyError::InvalidArgument(err_msg)); @@ -295,10 +284,10 @@ impl IndexMerger { let field_type = field_entry.field_type(); match field_type { FieldType::Facet(_) => { - let term_ordinal_mapping = term_ord_mappings - .remove(&field) - .expect("Logic Error in Tantivy (Please report). Facet field should have required a\ - `term_ordinal_mapping`."); + let term_ordinal_mapping = term_ord_mappings.remove(&field).expect( + "Logic Error in Tantivy (Please report). Facet field should have required \ + a`term_ordinal_mapping`.", + ); self.write_hierarchical_facet_field( field, &term_ordinal_mapping, @@ -340,25 +329,29 @@ impl IndexMerger { fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &SegmentDocIdMapping, ) -> crate::Result<()> { - let (min_value, max_value) = self.readers.iter().filter_map(|reader|{ - let u64_reader: DynamicFastFieldReader = reader - .fast_fields() - .typed_fast_field_reader(field) - .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); + let (min_value, max_value) = self + .readers + .iter() + .filter_map(|reader| { + let u64_reader: DynamicFastFieldReader = + reader.fast_fields().typed_fast_field_reader(field).expect( + "Failed to find a reader for single fast field. This is a tantivy bug and \ + it should never happen.", + ); compute_min_max_val(&u64_reader, reader) }) - .reduce(|a, b| { - (a.0.min(b.0), a.1.max(b.1)) - }).expect("Unexpected error, empty readers in IndexMerger"); + .reduce(|a, b| (a.0.min(b.0), a.1.max(b.1))) + .expect("Unexpected error, empty readers in IndexMerger"); let fast_field_readers = self .readers .iter() .map(|reader| { - let u64_reader: DynamicFastFieldReader = reader - .fast_fields() - .typed_fast_field_reader(field) - .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); + let u64_reader: DynamicFastFieldReader = + reader.fast_fields().typed_fast_field_reader(field).expect( + "Failed to find a reader for single fast field. This is a tantivy bug and \ + it should never happen.", + ); u64_reader }) .collect::>(); @@ -574,12 +567,20 @@ impl IndexMerger { fast_field_serializer: &mut CompositeFastFieldSerializer, doc_id_mapping: &SegmentDocIdMapping, ) -> crate::Result> { - let reader_ordinal_and_field_accessors = self.readers.iter().map(|reader|{ - let u64s_reader: MultiValuedFastFieldReader = reader.fast_fields() - .typed_fast_field_multi_reader(field) - .expect("Failed to find index for multivalued field. This is a bug in tantivy, please report."); - (reader, u64s_reader) - }).collect::>(); + let reader_ordinal_and_field_accessors = self + .readers + .iter() + .map(|reader| { + let u64s_reader: MultiValuedFastFieldReader = reader + .fast_fields() + .typed_fast_field_multi_reader(field) + .expect( + "Failed to find index for multivalued field. 
This is a bug in tantivy, \ + please report.", + ); + (reader, u64s_reader) + }) + .collect::>(); Self::write_1_n_fast_field_idx_generic( field, @@ -641,8 +642,8 @@ impl IndexMerger { Ok(()) } - /// Creates a mapping if the segments are stacked. this is helpful to merge codelines between index - /// sorting and the others + /// Creates a mapping if the segments are stacked. this is helpful to merge codelines between + /// index sorting and the others pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result { let total_num_new_docs = self .readers @@ -697,8 +698,8 @@ impl IndexMerger { .fast_fields() .typed_fast_field_multi_reader(field) .expect( - "Failed to find multivalued fast field reader. This is a bug in \ - tantivy. Please report.", + "Failed to find multivalued fast field reader. This is a bug in tantivy. \ + Please report.", ); for doc in reader.doc_ids_alive() { ff_reader.get_vals(doc, &mut vals); @@ -792,8 +793,10 @@ impl IndexMerger { .readers .iter() .map(|reader| { - let bytes_reader = reader.fast_fields().bytes(field) - .expect("Failed to find index for bytes field. This is a bug in tantivy, please report."); + let bytes_reader = reader.fast_fields().bytes(field).expect( + "Failed to find index for bytes field. This is a bug in tantivy, please \ + report.", + ); (reader, bytes_reader) }) .collect::>(); @@ -877,8 +880,8 @@ impl IndexMerger { // segment are stacked so that : // - Segment 0's doc ids become doc id [0, seg.max_doc] // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc] - // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, - // seg0.max_doc + seg1.max_doc + seg2.max_doc] + // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + + // seg2.max_doc] // // This stacking applies only when the index is not sorted, in that case the // doc_ids are kmerged by their sort property @@ -1122,34 +1125,26 @@ impl IndexMerger { #[cfg(test)] mod tests { - use crate::assert_nearly_equals; - use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; - use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector}; - use crate::collector::{Count, FacetCollector}; - use crate::core::Index; - use crate::fastfield::FastFieldReader; - use crate::query::AllQuery; - use crate::query::BooleanQuery; - use crate::query::Scorer; - use crate::query::TermQuery; - use crate::schema::Facet; - use crate::schema::IndexRecordOption; - use crate::schema::IntOptions; - use crate::schema::Term; - use crate::schema::TextFieldIndexing; - use crate::schema::{Cardinality, TEXT}; - use crate::schema::{Document, FacetOptions}; - use crate::DocAddress; - use crate::IndexSettings; - use crate::IndexSortByField; - use crate::IndexWriter; - use crate::Searcher; - use crate::{schema, DocSet, SegmentId}; - use crate::{schema::INDEXED, Order}; use byteorder::{BigEndian, ReadBytesExt}; use futures::executor::block_on; use schema::FAST; + use crate::collector::tests::{ + BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE, + }; + use crate::collector::{Count, FacetCollector}; + use crate::core::Index; + use crate::fastfield::FastFieldReader; + use crate::query::{AllQuery, BooleanQuery, Scorer, TermQuery}; + use crate::schema::{ + Cardinality, Document, Facet, FacetOptions, IndexRecordOption, IntOptions, Term, + TextFieldIndexing, INDEXED, TEXT, + }; + use crate::{ + assert_nearly_equals, schema, DocAddress, DocSet, IndexSettings, IndexSortByField, + IndexWriter, Order, Searcher, SegmentId, + }; 
+ #[test] fn test_index_merger_no_deletes() -> crate::Result<()> { let mut schema_builder = schema::Schema::builder(); @@ -1253,23 +1248,29 @@ mod tests { } { let doc = searcher.doc(DocAddress::new(0, 0))?; - assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); + assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b")); } { let doc = searcher.doc(DocAddress::new(0, 1))?; - assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c")); + assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("a b c")); } { let doc = searcher.doc(DocAddress::new(0, 2))?; - assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d")); + assert_eq!( + doc.get_first(text_field).unwrap().as_text(), + Some("a b c d") + ); } { let doc = searcher.doc(DocAddress::new(0, 3))?; - assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); + assert_eq!(doc.get_first(text_field).unwrap().as_text(), Some("af b")); } { let doc = searcher.doc(DocAddress::new(0, 4))?; - assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g")); + assert_eq!( + doc.get_first(text_field).unwrap().as_text(), + Some("a b c g") + ); } { let get_fast_vals = |terms: Vec| { @@ -1674,7 +1675,7 @@ mod tests { index_builder = index_builder.settings(settings); } let index = index_builder.create_in_ram().unwrap(); - //let index = Index::create_in_ram(schema_builder.build()); + // let index = Index::create_in_ram(schema_builder.build()); let reader = index.reader().unwrap(); let mut int_val = 0; { @@ -1708,7 +1709,9 @@ mod tests { index_doc(&mut index_writer, &["/top/d"], &mut 0); index_doc(&mut index_writer, &["/top/e"], &mut 10); index_writer.commit().expect("committed"); - index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges + index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the + // segments don' have disjunct + // ranges } else { index_doc(&mut index_writer, &["/top/d"], &mut int_val); index_doc(&mut index_writer, &["/top/e"], &mut int_val); @@ -2057,7 +2060,8 @@ mod tests { let mut term_scorer = term_query .specialized_weight(&searcher, true)? .specialized_scorer(segment_reader, 1.0)?; - // the difference compared to before is instrinsic to the bm25 formula. no worries there. + // the difference compared to before is instrinsic to the bm25 formula. no worries + // there. 
for doc in segment_reader.doc_ids_alive() { assert_eq!(term_scorer.doc(), doc); assert_nearly_equals!(term_scorer.block_max_score(), 0.003478312); diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index 42a5fd0e3..73af2fafd 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -1,19 +1,16 @@ #[cfg(test)] mod tests { + use futures::executor::block_on; + use crate::collector::TopDocs; use crate::core::Index; - use crate::fastfield::MultiValuedFastFieldReader; - use crate::fastfield::{AliveBitSet, FastFieldReader}; + use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader}; use crate::query::QueryParser; use crate::schema::{ - self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, TextFieldIndexing, + self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, IntOptions, + TextFieldIndexing, TextOptions, }; - use crate::schema::{IntOptions, TextOptions}; - use crate::DocAddress; - use crate::IndexSortByField; - use crate::Order; - use crate::{DocSet, IndexSettings, Postings, Term}; - use futures::executor::block_on; + use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term}; fn create_test_index_posting_list_issue(index_settings: Option) -> Index { let mut schema_builder = schema::Schema::builder(); @@ -59,8 +56,8 @@ mod tests { index } - // force_disjunct_segment_sort_values forces the field, by which the index is sorted have disjunct - // ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500] + // force_disjunct_segment_sort_values forces the field, by which the index is sorted have + // disjunct ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500] fn create_test_index( index_settings: Option, force_disjunct_segment_sort_values: bool, @@ -282,11 +279,11 @@ mod tests { }; let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap(); assert_eq!( - doc.get_first(my_text_field).unwrap().text(), + doc.get_first(my_text_field).unwrap().as_text(), Some("blubber") ); let doc = searcher.doc(DocAddress::new(0, 0)).unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(1000)); + assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000)); } } @@ -465,17 +462,17 @@ mod tests { // access doc store { let doc = searcher.doc(DocAddress::new(0, 0)).unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(1)); + assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1)); let doc = searcher.doc(DocAddress::new(0, 1)).unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(2)); + assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2)); let doc = searcher.doc(DocAddress::new(0, 2)).unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(3)); + assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3)); let doc = searcher.doc(DocAddress::new(0, 3)).unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(10)); + assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10)); let doc = searcher.doc(DocAddress::new(0, 4)).unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(20)); + assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20)); let doc = searcher.doc(DocAddress::new(0, 5)).unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().u64_value(), Some(1_000)); + assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000)); } } } @@ 
-483,20 +480,14 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench_sorted_index_merge { - use crate::core::Index; - //use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema; - use crate::fastfield::DynamicFastFieldReader; - use crate::fastfield::FastFieldReader; - use crate::indexer::merger::IndexMerger; - use crate::schema::Cardinality; - use crate::schema::Document; - use crate::schema::IntOptions; - use crate::schema::Schema; - use crate::IndexSettings; - use crate::IndexSortByField; - use crate::IndexWriter; - use crate::Order; use test::{self, Bencher}; + + use crate::core::Index; + // use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema; + use crate::fastfield::{DynamicFastFieldReader, FastFieldReader}; + use crate::indexer::merger::IndexMerger; + use crate::schema::{Cardinality, Document, IntOptions, Schema}; + use crate::{IndexSettings, IndexSortByField, IndexWriter, Order}; fn create_index(sort_by_field: Option) -> Index { let mut schema_builder = Schema::builder(); let int_options = IntOptions::default() @@ -544,13 +535,13 @@ mod bench_sorted_index_merge { IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?; let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap(); b.iter(|| { - - let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)|{ - let reader = &merger.readers[*ordinal as usize]; - let u64_reader: DynamicFastFieldReader = reader - .fast_fields() - .typed_fast_field_reader(field) - .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); + let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)| { + let reader = &merger.readers[*ordinal as usize]; + let u64_reader: DynamicFastFieldReader = + reader.fast_fields().typed_fast_field_reader(field).expect( + "Failed to find a reader for single fast field. This is a tantivy bug and \ + it should never happen.", + ); (doc_id, reader, u64_reader) }); // add values in order of the new doc_ids @@ -560,7 +551,6 @@ mod bench_sorted_index_merge { } val - }); Ok(()) @@ -572,7 +562,7 @@ mod bench_sorted_index_merge { order: Order::Desc, }; let index = create_index(Some(sort_by_field.clone())); - //let field = index.schema().get_field("intval").unwrap(); + // let field = index.schema().get_field("intval").unwrap(); let segments = index.searchable_segments().unwrap(); let merger: IndexMerger = IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?; diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index d8bec5200..d8f9b0568 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -23,8 +23,6 @@ mod stamper; use crossbeam::channel; use smallvec::SmallVec; -use crate::indexer::operation::AddOperation; - pub use self::index_writer::IndexWriter; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; @@ -33,9 +31,9 @@ pub use self::prepared_commit::PreparedCommit; pub use self::segment_entry::SegmentEntry; pub use self::segment_manager::SegmentManager; pub use self::segment_serializer::SegmentSerializer; -pub use self::segment_updater::merge_filtered_segments; -pub use self::segment_updater::merge_indices; +pub use self::segment_updater::{merge_filtered_segments, merge_indices}; pub use self::segment_writer::SegmentWriter; +use crate::indexer::operation::AddOperation; /// Alias for the default merge policy, which is the `LogMergePolicy`. 
pub type DefaultMergePolicy = LogMergePolicy; diff --git a/src/indexer/operation.rs b/src/indexer/operation.rs index e6dc33f10..e0505be1d 100644 --- a/src/indexer/operation.rs +++ b/src/indexer/operation.rs @@ -1,5 +1,4 @@ -use crate::schema::Document; -use crate::schema::Term; +use crate::schema::{Document, Term}; use crate::Opstamp; /// Timestamped Delete operation. diff --git a/src/indexer/prepared_commit.rs b/src/indexer/prepared_commit.rs index 4ad71178a..13d2cfa06 100644 --- a/src/indexer/prepared_commit.rs +++ b/src/indexer/prepared_commit.rs @@ -1,6 +1,7 @@ +use futures::executor::block_on; + use super::IndexWriter; use crate::Opstamp; -use futures::executor::block_on; /// A prepared commit pub struct PreparedCommit<'a> { diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index b7cae25c1..0e5002338 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -1,9 +1,10 @@ -use crate::core::SegmentId; -use crate::core::SegmentMeta; -use crate::indexer::delete_queue::DeleteCursor; -use common::BitSet; use std::fmt; +use common::BitSet; + +use crate::core::{SegmentId, SegmentMeta}; +use crate::indexer::delete_queue::DeleteCursor; + /// A segment entry describes the state of /// a given segment, at a given instant. /// diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 0f2b634c8..cbebd2e20 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -1,13 +1,12 @@ +use std::collections::hash_set::HashSet; +use std::fmt::{self, Debug, Formatter}; +use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; + use super::segment_register::SegmentRegister; -use crate::core::SegmentId; -use crate::core::SegmentMeta; +use crate::core::{SegmentId, SegmentMeta}; use crate::error::TantivyError; use crate::indexer::delete_queue::DeleteCursor; use crate::indexer::SegmentEntry; -use std::collections::hash_set::HashSet; -use std::fmt::{self, Debug, Formatter}; -use std::sync::RwLock; -use std::sync::{RwLockReadGuard, RwLockWriteGuard}; #[derive(Default)] struct SegmentRegisters { @@ -154,21 +153,23 @@ impl SegmentManager { let mut segment_entries = vec![]; if registers_lock.uncommitted.contains_all(segment_ids) { for segment_id in segment_ids { - let segment_entry = registers_lock.uncommitted - .get(segment_id) - .expect("Segment id not found {}. Should never happen because of the contains all if-block."); + let segment_entry = registers_lock.uncommitted.get(segment_id).expect( + "Segment id not found {}. Should never happen because of the contains all \ + if-block.", + ); segment_entries.push(segment_entry); } } else if registers_lock.committed.contains_all(segment_ids) { for segment_id in segment_ids { - let segment_entry = registers_lock.committed - .get(segment_id) - .expect("Segment id not found {}. Should never happen because of the contains all if-block."); + let segment_entry = registers_lock.committed.get(segment_id).expect( + "Segment id not found {}. Should never happen because of the contains all \ + if-block.", + ); segment_entries.push(segment_entry); } } else { - let error_msg = "Merge operation sent for segments that are not \ - all uncommited or commited." + let error_msg = "Merge operation sent for segments that are not all uncommited or \ + commited." 
.to_string(); return Err(TantivyError::InvalidArgument(error_msg)); } @@ -193,8 +194,8 @@ impl SegmentManager { .ok_or_else(|| { warn!("couldn't find segment in SegmentManager"); crate::TantivyError::InvalidArgument( - "The segments that were merged could not be found in the SegmentManager. \ - This is not necessarily a bug, and can happen after a rollback for instance." + "The segments that were merged could not be found in the SegmentManager. This \ + is not necessarily a bug, and can happen after a rollback for instance." .to_string(), ) })?; diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 75a5316e4..0068d598b 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -1,11 +1,9 @@ -use crate::core::SegmentId; -use crate::core::SegmentMeta; +use std::collections::{HashMap, HashSet}; +use std::fmt::{self, Debug, Display, Formatter}; + +use crate::core::{SegmentId, SegmentMeta}; use crate::indexer::delete_queue::DeleteCursor; use crate::indexer::segment_entry::SegmentEntry; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt::Display; -use std::fmt::{self, Debug, Formatter}; /// The segment register keeps track /// of the list of segment, their size as well diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index 63ccb530c..923b5b6dd 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -1,5 +1,4 @@ -use crate::core::Segment; -use crate::core::SegmentComponent; +use crate::core::{Segment, SegmentComponent}; use crate::fastfield::CompositeFastFieldSerializer; use crate::fieldnorm::FieldNormsSerializer; use crate::postings::InvertedIndexSerializer; diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index cae64d796..5dad00b6d 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -1,11 +1,21 @@ +use std::borrow::BorrowMut; +use std::collections::HashSet; +use std::io; +use std::io::Write; +use std::ops::Deref; +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, RwLock}; + +use fail::fail_point; +use futures::channel::oneshot; +use futures::executor::{ThreadPool, ThreadPoolBuilder}; +use futures::future::{Future, TryFutureExt}; + use super::segment_manager::SegmentManager; -use crate::core::Index; -use crate::core::IndexMeta; -use crate::core::IndexSettings; -use crate::core::Segment; -use crate::core::SegmentId; -use crate::core::SegmentMeta; -use crate::core::META_FILEPATH; +use crate::core::{ + Index, IndexMeta, IndexSettings, Segment, SegmentId, SegmentMeta, META_FILEPATH, +}; use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult}; use crate::fastfield::AliveBitSet; use crate::indexer::delete_queue::DeleteCursor; @@ -14,27 +24,12 @@ use crate::indexer::merge_operation::MergeOperationInventory; use crate::indexer::merger::IndexMerger; use crate::indexer::segment_manager::SegmentsStatus; use crate::indexer::stamper::Stamper; -use crate::indexer::SegmentEntry; -use crate::indexer::SegmentSerializer; -use crate::indexer::{DefaultMergePolicy, MergePolicy}; -use crate::indexer::{MergeCandidate, MergeOperation}; +use crate::indexer::{ + DefaultMergePolicy, MergeCandidate, MergeOperation, MergePolicy, SegmentEntry, + SegmentSerializer, +}; use crate::schema::Schema; -use crate::Opstamp; -use crate::TantivyError; -use fail::fail_point; -use futures::channel::oneshot; -use futures::executor::{ThreadPool, ThreadPoolBuilder}; 
-use futures::future::Future; -use futures::future::TryFutureExt; -use std::borrow::BorrowMut; -use std::collections::HashSet; -use std::io; -use std::io::Write; -use std::ops::Deref; -use std::path::PathBuf; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; -use std::sync::RwLock; +use crate::{Opstamp, TantivyError}; const NUM_MERGE_THREADS: usize = 4; @@ -702,10 +697,7 @@ mod tests { use crate::indexer::segment_updater::merge_filtered_segments; use crate::query::QueryParser; use crate::schema::*; - use crate::Directory; - use crate::DocAddress; - use crate::Index; - use crate::Segment; + use crate::{Directory, DocAddress, Index, Segment}; #[test] fn test_delete_during_merge() -> crate::Result<()> { diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index eb40c86ef..fd7194d62 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -1,24 +1,16 @@ -use super::{ - doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}, - operation::AddOperation, -}; +use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}; +use super::operation::AddOperation; +use crate::core::Segment; +use crate::fastfield::FastFieldsWriter; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; use crate::indexer::segment_serializer::SegmentSerializer; -use crate::postings::compute_table_size; -use crate::postings::MultiFieldPostingsWriter; -use crate::schema::FieldType; -use crate::schema::Schema; -use crate::schema::Term; -use crate::schema::Value; -use crate::schema::{Field, FieldEntry}; -use crate::store::StoreReader; -use crate::tokenizer::{BoxTokenStream, PreTokenizedStream}; -use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; -use crate::tokenizer::{TokenStreamChain, Tokenizer}; -use crate::Opstamp; -use crate::{core::Segment, store::StoreWriter}; -use crate::{fastfield::FastFieldsWriter, schema::Type}; -use crate::{DocId, SegmentComponent}; +use crate::postings::{compute_table_size, MultiFieldPostingsWriter}; +use crate::schema::{Field, FieldEntry, FieldType, Schema, Term, Type, Value}; +use crate::store::{StoreReader, StoreWriter}; +use crate::tokenizer::{ + BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, TokenStreamChain, Tokenizer, +}; +use crate::{DocId, Opstamp, SegmentComponent}; /// Computes the initial size of the hash table. /// @@ -31,8 +23,11 @@ fn initial_table_size(per_thread_memory_budget: usize) -> crate::Result { { Ok(limit.min(19)) // we cap it at 2^19 = 512K. } else { - Err(crate::TantivyError::InvalidArgument( - format!("per thread memory budget (={}) is too small. Raise the memory budget or lower the number of threads.", per_thread_memory_budget))) + Err(crate::TantivyError::InvalidArgument(format!( + "per thread memory budget (={}) is too small. 
Raise the memory budget or lower the \ + number of threads.", + per_thread_memory_budget + ))) } } @@ -158,7 +153,7 @@ impl SegmentWriter { self.fast_field_writers.add_document(&doc); - for (field, field_values) in doc.get_sorted_field_values() { + for (field, values) in doc.get_sorted_field_values() { let field_entry = schema.get_field_entry(field); let make_schema_error = || { crate::TantivyError::SchemaError(format!( @@ -175,8 +170,8 @@ impl SegmentWriter { match *field_entry.field_type() { FieldType::Facet(_) => { term_buffer.set_field(Type::Facet, field); - for field_value in field_values { - let facet = field_value.value().facet().ok_or_else(make_schema_error)?; + for value in values { + let facet = value.as_facet().ok_or_else(make_schema_error)?; let facet_str = facet.encoded_str(); let mut unordered_term_id_opt = None; FacetTokenizer @@ -200,8 +195,8 @@ impl SegmentWriter { let mut offsets = vec![]; let mut total_offset = 0; - for field_value in field_values { - match field_value.value() { + for value in values { + match value { Value::PreTokStr(tok_str) => { offsets.push(total_offset); if let Some(last_token) = tok_str.tokens.last() { @@ -237,56 +232,41 @@ impl SegmentWriter { self.fieldnorms_writer.record(doc_id, field, num_tokens); } FieldType::U64(_) => { - for field_value in field_values { - term_buffer.set_field(Type::U64, field_value.field()); - let u64_val = field_value - .value() - .u64_value() - .ok_or_else(make_schema_error)?; + for value in values { + term_buffer.set_field(Type::U64, field); + let u64_val = value.as_u64().ok_or_else(make_schema_error)?; term_buffer.set_u64(u64_val); multifield_postings.subscribe(doc_id, term_buffer); } } FieldType::Date(_) => { - for field_value in field_values { - term_buffer.set_field(Type::Date, field_value.field()); - let date_val = field_value - .value() - .date_value() - .ok_or_else(make_schema_error)?; + for value in values { + term_buffer.set_field(Type::Date, field); + let date_val = value.as_date().ok_or_else(make_schema_error)?; term_buffer.set_i64(date_val.timestamp()); multifield_postings.subscribe(doc_id, term_buffer); } } FieldType::I64(_) => { - for field_value in field_values { - term_buffer.set_field(Type::I64, field_value.field()); - let i64_val = field_value - .value() - .i64_value() - .ok_or_else(make_schema_error)?; + for value in values { + term_buffer.set_field(Type::I64, field); + let i64_val = value.as_i64().ok_or_else(make_schema_error)?; term_buffer.set_i64(i64_val); multifield_postings.subscribe(doc_id, term_buffer); } } FieldType::F64(_) => { - for field_value in field_values { - term_buffer.set_field(Type::F64, field_value.field()); - let f64_val = field_value - .value() - .f64_value() - .ok_or_else(make_schema_error)?; + for value in values { + term_buffer.set_field(Type::F64, field); + let f64_val = value.as_f64().ok_or_else(make_schema_error)?; term_buffer.set_f64(f64_val); multifield_postings.subscribe(doc_id, term_buffer); } } FieldType::Bytes(_) => { - for field_value in field_values { - term_buffer.set_field(Type::Bytes, field_value.field()); - let bytes = field_value - .value() - .bytes_value() - .ok_or_else(make_schema_error)?; + for value in values { + term_buffer.set_field(Type::Bytes, field); + let bytes = value.as_bytes().ok_or_else(make_schema_error)?; term_buffer.set_bytes(bytes); self.multifield_postings.subscribe(doc_id, term_buffer); } diff --git a/src/indexer/stamper.rs b/src/indexer/stamper.rs index fa1842892..5ffd12b99 100644 --- a/src/indexer/stamper.rs +++ b/src/indexer/stamper.rs @@ 
-1,14 +1,16 @@ -use crate::Opstamp; use std::ops::Range; use std::sync::atomic::Ordering; use std::sync::Arc; +use crate::Opstamp; + #[cfg(not(target_arch = "arm"))] mod atomic_impl { - use crate::Opstamp; use std::sync::atomic::{AtomicU64, Ordering}; + use crate::Opstamp; + #[derive(Default)] pub struct AtomicU64Wrapper(AtomicU64); @@ -31,11 +33,12 @@ mod atomic_impl { #[cfg(target_arch = "arm")] mod atomic_impl { - use crate::Opstamp; /// Under other architecture, we rely on a mutex. use std::sync::atomic::Ordering; use std::sync::RwLock; + use crate::Opstamp; + #[derive(Default)] pub struct AtomicU64Wrapper(RwLock); diff --git a/src/lib.rs b/src/lib.rs index 5b3761adf..676149801 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -125,9 +125,10 @@ mod functional_test; #[macro_use] mod macros; -pub use crate::error::TantivyError; pub use chrono; +pub use crate::error::TantivyError; + /// Tantivy result. /// /// Within tantivy, please avoid importing `Result` using `use crate::Result` @@ -163,29 +164,26 @@ mod snippet; pub use self::snippet::{Snippet, SnippetGenerator}; mod docset; +use std::fmt; + +pub use census::{Inventory, TrackedObject}; +pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, HasLen}; +use once_cell::sync::Lazy; +use serde::{Deserialize, Serialize}; + pub use self::docset::{DocSet, TERMINATED}; -pub use crate::core::{Executor, SegmentComponent}; pub use crate::core::{ - Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, - SearcherGeneration, Segment, SegmentId, SegmentMeta, + Executor, Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, + Order, Searcher, SearcherGeneration, Segment, SegmentComponent, SegmentId, SegmentMeta, + SegmentReader, }; -pub use crate::core::{InvertedIndexReader, SegmentReader}; pub use crate::directory::Directory; pub use crate::indexer::demuxer::*; -pub use crate::indexer::merge_filtered_segments; -pub use crate::indexer::merge_indices; pub use crate::indexer::operation::UserOperation; -pub use crate::indexer::{IndexWriter, PreparedCommit}; +pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit}; pub use crate::postings::Postings; pub use crate::reader::LeasedItem; pub use crate::schema::{Document, Term}; -pub use census::{Inventory, TrackedObject}; -pub use common::HasLen; -pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; -use std::fmt; - -use once_cell::sync::Lazy; -use serde::{Deserialize, Serialize}; /// Index format version. const INDEX_FORMAT_VERSION: u32 = 4; @@ -237,11 +235,9 @@ pub fn version_string() -> &'static str { /// Defines tantivy's merging strategy pub mod merge_policy { - pub use crate::indexer::DefaultMergePolicy; - pub use crate::indexer::LogMergePolicy; - pub use crate::indexer::MergeCandidate; - pub use crate::indexer::MergePolicy; - pub use crate::indexer::NoMergePolicy; + pub use crate::indexer::{ + DefaultMergePolicy, LogMergePolicy, MergeCandidate, MergePolicy, NoMergePolicy, + }; } /// A `u32` identifying a document within a segment. 
@@ -299,21 +295,18 @@ pub struct DocAddress { #[cfg(test)] pub mod tests { + use common::{BinarySerializable, FixedSize}; + use rand::distributions::{Bernoulli, Uniform}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::core::SegmentReader; use crate::docset::{DocSet, TERMINATED}; use crate::fastfield::FastFieldReader; use crate::query::BooleanQuery; use crate::schema::*; - use crate::DocAddress; - use crate::Index; - use crate::Postings; - use crate::ReloadPolicy; - use common::{BinarySerializable, FixedSize}; - use rand::distributions::Bernoulli; - use rand::distributions::Uniform; - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; + use crate::{DocAddress, Index, Postings, ReloadPolicy}; pub fn fixed_size_test() { let mut buffer = Vec::new(); @@ -868,11 +861,11 @@ pub mod tests { assert_eq!(document.len(), 3); let values: Vec<&Value> = document.get_all(text_field).collect(); assert_eq!(values.len(), 2); - assert_eq!(values[0].text(), Some("tantivy")); - assert_eq!(values[1].text(), Some("some other value")); + assert_eq!(values[0].as_text(), Some("tantivy")); + assert_eq!(values[1].as_text(), Some("some other value")); let values: Vec<&Value> = document.get_all(other_text_field).collect(); assert_eq!(values.len(), 1); - assert_eq!(values[0].text(), Some("short")); + assert_eq!(values[0].as_text(), Some("short")); } #[test] @@ -938,11 +931,12 @@ pub mod tests { // motivated by #729 #[test] fn test_update_via_delete_insert() -> crate::Result<()> { + use futures::executor::block_on; + use crate::collector::Count; use crate::indexer::NoMergePolicy; use crate::query::AllQuery; use crate::SegmentId; - use futures::executor::block_on; const DOC_COUNT: u64 = 2u64; diff --git a/src/positions/mod.rs b/src/positions/mod.rs index aba28f808..fd3297bf9 100644 --- a/src/positions/mod.rs +++ b/src/positions/mod.rs @@ -7,17 +7,20 @@ //! for all terms of a given field, one term after the other. //! //! Each terms is encoded independently. -//! Like for positing lists, tantivy rely on simd bitpacking to encode the positions delta in blocks of 128 deltas. -//! Because we rarely have a multiple of 128, a final block may encode the remaining values variable byte encoding. +//! Like for positing lists, tantivy rely on simd bitpacking to encode the positions delta in blocks +//! of 128 deltas. Because we rarely have a multiple of 128, a final block may encode the remaining +//! values variable byte encoding. //! -//! In order to make reading possible, the term delta positions first encodes the number of bitpacked blocks, -//! then the bitwidth for each blocks, then the actual bitpacked block and finally the final variable int encoded block. +//! In order to make reading possible, the term delta positions first encodes the number of +//! bitpacked blocks, then the bitwidth for each blocks, then the actual bitpacked block and finally +//! the final variable int encoded block. //! -//! Contrary to postings list, the reader does not have access on the number of positions that is encoded, and instead -//! stops decoding the last block when its byte slice has been entirely read. +//! Contrary to postings list, the reader does not have access on the number of positions that is +//! encoded, and instead stops decoding the last block when its byte slice has been entirely read. //! //! More formally: -//! * *Positions* := *NumBitPackedBlocks* *BitPackedPositionBlock*^(P/128) *BitPackedPositionsDeltaBitWidth* *VIntPosDeltas*? +//! 
* *Positions* := *NumBitPackedBlocks* *BitPackedPositionBlock*^(P/128) +//! *BitPackedPositionsDeltaBitWidth* *VIntPosDeltas*? //! * *NumBitPackedBlocks**: := *P* / 128 encoded as a variable byte integer. //! * *BitPackedPositionBlock* := bit width encoded block of 128 positions delta //! * *BitPackedPositionsDeltaBitWidth* := (*BitWidth*: u8)^*NumBitPackedBlocks* @@ -27,21 +30,24 @@ mod reader; mod serializer; +use bitpacking::{BitPacker, BitPacker4x}; + pub use self::reader::PositionReader; pub use self::serializer::PositionSerializer; -use bitpacking::{BitPacker, BitPacker4x}; const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN; #[cfg(test)] pub mod tests { + use std::iter; + + use proptest::prelude::*; + use proptest::sample::select; + use super::PositionSerializer; use crate::directory::OwnedBytes; use crate::positions::reader::PositionReader; - use proptest::prelude::*; - use proptest::sample::select; - use std::iter; fn create_positions_data(vals: &[u32]) -> crate::Result { let mut positions_buffer = vec![]; diff --git a/src/positions/reader.rs b/src/positions/reader.rs index 25f857cc1..73d5cd56b 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -1,9 +1,10 @@ use std::io; +use common::{BinarySerializable, VInt}; + use crate::directory::OwnedBytes; use crate::positions::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::{BlockDecoder, VIntDecoder}; -use common::{BinarySerializable, VInt}; /// When accessing the position of a term, we get a positions_idx from the `Terminfo`. /// This means we need to skip to the `nth` positions efficiently. diff --git a/src/positions/serializer.rs b/src/positions/serializer.rs index 23f242335..d54bbb292 100644 --- a/src/positions/serializer.rs +++ b/src/positions/serializer.rs @@ -1,9 +1,10 @@ -use crate::positions::COMPRESSION_BLOCK_SIZE; -use crate::postings::compression::BlockEncoder; -use crate::postings::compression::VIntEncoder; -use common::{BinarySerializable, CountingWriter, VInt}; use std::io::{self, Write}; +use common::{BinarySerializable, CountingWriter, VInt}; + +use crate::positions::COMPRESSION_BLOCK_SIZE; +use crate::postings::compression::{BlockEncoder, VIntEncoder}; + /// The PositionSerializer is in charge of serializing all of the positions /// of all of the terms of a given field. /// diff --git a/src/postings/block_search.rs b/src/postings/block_search.rs index f53eca38f..3a53bd78f 100644 --- a/src/postings/block_search.rs +++ b/src/postings/block_search.rs @@ -10,7 +10,7 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE; // .take_while(|&&val| val < target) // .count() /// ``` -/// +/// /// the `start` argument is just used to hint that the response is /// greater than beyond `start`. the implementation may or may not use /// it for optimization. 
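As a reference point for the contract described in the doc comment above, a sketch of the trivial equivalent (it mirrors the `search_in_block_trivial_but_slow` helper in the tests below; the branchless version is expected to return the same count):

    // Number of values in the block that are strictly smaller than `target`.
    fn search_in_block_slow(block: &[u32], target: u32) -> usize {
        block.iter().take_while(|&&val| val < target).count()
    }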
@@ -35,11 +35,13 @@ pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32 #[cfg(test)] mod tests { + use std::collections::HashSet; + + use proptest::prelude::*; + use super::branchless_binary_search; use crate::docset::TERMINATED; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; - use proptest::prelude::*; - use std::collections::HashSet; fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize { block.iter().take_while(|&&val| val < target).count() diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index e899bafb4..35d03156e 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -1,14 +1,14 @@ use std::io; -use crate::directory::FileSlice; -use crate::directory::OwnedBytes; +use common::{BinarySerializable, VInt}; + +use crate::directory::{FileSlice, OwnedBytes}; use crate::fieldnorm::FieldNormReader; use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; use crate::postings::{BlockInfo, FreqReadingOption, SkipReader}; use crate::query::Bm25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; -use common::{BinarySerializable, VInt}; fn max_score>(mut it: I) -> Option { it.next().map(|first| it.fold(first, Score::max)) @@ -346,18 +346,16 @@ impl BlockSegmentPostings { #[cfg(test)] mod tests { + use common::HasLen; + use super::BlockSegmentPostings; use crate::core::Index; use crate::docset::{DocSet, TERMINATED}; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::postings::Postings; use crate::postings::SegmentPostings; - use crate::schema::IndexRecordOption; - use crate::schema::Schema; - use crate::schema::Term; - use crate::schema::INDEXED; + use crate::schema::{IndexRecordOption, Schema, Term, INDEXED}; use crate::DocId; - use common::HasLen; #[test] fn test_empty_segment_postings() { diff --git a/src/postings/compression/mod.rs b/src/postings/compression/mod.rs index 84a250b65..cfd353ec0 100644 --- a/src/postings/compression/mod.rs +++ b/src/postings/compression/mod.rs @@ -310,12 +310,12 @@ pub mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + use test::Bencher; + use super::*; use crate::TERMINATED; - use rand::rngs::StdRng; - use rand::Rng; - use rand::SeedableRng; - use test::Bencher; fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec { let mut seed: [u8; 32] = [0; 32]; @@ -349,16 +349,15 @@ mod bench { } //#[test] - //fn test_all_docs_compression_numbits() { - //for expected_num_bits in 0u8.. { - //let mut data = [0u32; 128]; - //if expected_num_bits > 0 { - //data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32; - //} - //let mut encoder = BlockEncoder::new(); - //let (num_bits, compressed) = encoder.compress_block_unsorted(&data); - //assert_eq!(compressed.len(), compressed_block_size(num_bits)); + // fn test_all_docs_compression_numbits() { + // for expected_num_bits in 0u8.. 
{ + // let mut data = [0u32; 128]; + // if expected_num_bits > 0 { + // data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32; //} + // let mut encoder = BlockEncoder::new(); + // let (num_bits, compressed) = encoder.compress_block_unsorted(&data); + // assert_eq!(compressed.len(), compressed_block_size(num_bits)); //} const NUM_INTS_BENCH_VINT: usize = 10; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index a115ca438..a66fe5f74 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -1,6 +1,4 @@ -/*! -Postings module (also called inverted index) -*/ +//! Postings module (also called inverted index) mod block_search; @@ -38,24 +36,20 @@ pub(crate) enum FreqReadingOption { #[cfg(test)] pub mod tests { - use super::InvertedIndexSerializer; - use super::Postings; - use crate::core::Index; - use crate::core::SegmentComponent; - use crate::core::SegmentReader; + use std::mem; + + use super::{InvertedIndexSerializer, Postings}; + use crate::core::{Index, SegmentComponent, SegmentReader}; use crate::docset::{DocSet, TERMINATED}; use crate::fieldnorm::FieldNormReader; use crate::indexer::operation::AddOperation; use crate::indexer::SegmentWriter; use crate::query::Scorer; - use crate::schema::{Field, TextOptions}; - use crate::schema::{IndexRecordOption, TextFieldIndexing}; - use crate::schema::{Schema, Term, INDEXED, TEXT}; + use crate::schema::{ + Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, TEXT, + }; use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN}; - use crate::DocId; - use crate::HasLen; - use crate::Score; - use std::mem; + use crate::{DocId, HasLen, Score}; #[test] pub fn test_position_write() -> crate::Result<()> { @@ -565,18 +559,16 @@ pub mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use crate::docset::TERMINATED; - use crate::query::Intersection; - use crate::schema::IndexRecordOption; - use crate::schema::{Document, Field, Schema, Term, STRING}; - use crate::tests; - use crate::DocSet; - use crate::Index; use once_cell::sync::Lazy; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use test::{self, Bencher}; + use crate::docset::TERMINATED; + use crate::query::Intersection; + use crate::schema::{Document, Field, IndexRecordOption, Schema, Term, STRING}; + use crate::{tests, DocSet, Index}; + pub static TERM_A: Lazy = Lazy::new(|| { let field = Field::from_field_id(0); Term::from_field_text(field, "a") diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 1a7d45587..72726888d 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -1,23 +1,22 @@ -use super::stacker::{Addr, MemoryArena, TermHashMap}; - -use crate::postings::recorder::{ - BufferLender, NothingRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder, -}; -use crate::postings::UnorderedTermId; -use crate::postings::{FieldSerializer, InvertedIndexSerializer}; -use crate::schema::{Field, FieldEntry, FieldType, Schema, Term}; -use crate::schema::{IndexRecordOption, Type}; -use crate::termdict::TermOrdinal; -use crate::tokenizer::TokenStream; -use crate::tokenizer::{Token, MAX_TOKEN_LEN}; -use crate::DocId; -use crate::{fieldnorm::FieldNormReaders, indexer::doc_id_mapping::DocIdMapping}; -use fnv::FnvHashMap; use std::collections::HashMap; use std::io; use std::marker::PhantomData; use std::ops::{DerefMut, Range}; +use fnv::FnvHashMap; + +use super::stacker::{Addr, MemoryArena, TermHashMap}; +use crate::fieldnorm::FieldNormReaders; +use 
crate::indexer::doc_id_mapping::DocIdMapping; +use crate::postings::recorder::{ + BufferLender, NothingRecorder, Recorder, TermFrequencyRecorder, TfAndPositionRecorder, +}; +use crate::postings::{FieldSerializer, InvertedIndexSerializer, UnorderedTermId}; +use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema, Term, Type}; +use crate::termdict::TermOrdinal; +use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN}; +use crate::DocId; + fn posting_from_field_entry(field_entry: &FieldEntry) -> Box { match *field_entry.field_type() { FieldType::Str(ref text_options) => text_options diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 11e8447fd..c2c12c560 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -1,8 +1,9 @@ +use common::{read_u32_vint, write_u32_vint}; + use super::stacker::{ExpUnrolledLinkedList, MemoryArena}; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::FieldSerializer; use crate::DocId; -use common::{read_u32_vint, write_u32_vint}; const POSITION_END: u32 = 0; @@ -119,7 +120,7 @@ impl Recorder for NothingRecorder { ) { let (buffer, doc_ids) = buffer_lender.lend_all(); self.stack.read_to_end(heap, buffer); - //TODO avoid reading twice. + // TODO avoid reading twice. if let Some(doc_id_map) = doc_id_map { doc_ids.extend( VInt32Reader::new(&buffer[..]) @@ -299,9 +300,7 @@ impl Recorder for TfAndPositionRecorder { #[cfg(test)] mod tests { - use super::write_u32_vint; - use super::BufferLender; - use super::VInt32Reader; + use super::{write_u32_vint, BufferLender, VInt32Reader}; #[test] fn test_buffer_lender() { diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index d42c46786..3fcc84e13 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,12 +1,11 @@ +use common::HasLen; + use crate::docset::DocSet; use crate::fastfield::AliveBitSet; use crate::positions::PositionReader; -use crate::postings::branchless_binary_search; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; -use crate::postings::BlockSegmentPostings; -use crate::postings::Postings; +use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings}; use crate::{DocId, TERMINATED}; -use common::HasLen; /// `SegmentPostings` represents the inverted list or postings associated to /// a term in a `Segment`. @@ -142,8 +141,7 @@ impl SegmentPostings { /// /// * `len` - number of document in the posting lists. /// * `data` - data array. The complete data is not necessarily used. - /// * `freq_handler` - the freq handler is in charge of decoding - /// frequencies and/or positions + /// * `freq_handler` - the freq handler is in charge of decoding frequencies and/or positions pub(crate) fn from_block_postings( segment_block_postings: BlockSegmentPostings, position_reader: Option, @@ -234,8 +232,7 @@ impl Postings for SegmentPostings { // In that case we hit the block just as if the frequency had been // decoded. The block is simply prefilled by the value 1. self.cur < COMPRESSION_BLOCK_SIZE, - "Have you forgotten to call `.advance()` at least once before calling \ - `.term_freq()`." + "Have you forgotten to call `.advance()` at least once before calling `.term_freq()`." 
); self.block_cursor.freq(self.cur) } @@ -264,9 +261,9 @@ impl Postings for SegmentPostings { #[cfg(test)] mod tests { - use super::SegmentPostings; use common::HasLen; + use super::SegmentPostings; use crate::docset::{DocSet, TERMINATED}; use crate::fastfield::AliveBitSet; use crate::postings::postings::Postings; diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index a2263b5d9..90c37b654 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,21 +1,20 @@ +use std::cmp::Ordering; +use std::io::{self, Write}; + +use common::{BinarySerializable, CountingWriter, VInt}; +use fail::fail_point; + use super::TermInfo; use crate::core::Segment; -use crate::directory::CompositeWrite; -use crate::directory::WritePtr; +use crate::directory::{CompositeWrite, WritePtr}; use crate::fieldnorm::FieldNormReader; use crate::positions::PositionSerializer; use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE}; use crate::postings::skip::SkipSerializer; use crate::query::Bm25Weight; -use crate::schema::{Field, FieldEntry, FieldType}; -use crate::schema::{IndexRecordOption, Schema}; +use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema}; use crate::termdict::{TermDictionaryBuilder, TermOrdinal}; use crate::{DocId, Score}; -use common::CountingWriter; -use common::{BinarySerializable, VInt}; -use fail::fail_point; -use std::cmp::Ordering; -use std::io::{self, Write}; /// `InvertedIndexSerializer` is in charge of serializing /// postings on disk, in the @@ -172,8 +171,8 @@ impl<'a> FieldSerializer<'a> { } /// Starts the postings for a new term. - /// * term - the term. It needs to come after the previous term according - /// to the lexicographical order. + /// * term - the term. It needs to come after the previous term according to the lexicographical + /// order. /// * term_doc_freq - return the number of document containing the term. pub fn new_term(&mut self, term: &[u8], term_doc_freq: u32) -> io::Result { assert!( @@ -308,8 +307,8 @@ pub struct PostingsSerializer { fieldnorm_reader: Option, bm25_weight: Option, - avg_fieldnorm: Score, // Average number of term in the field for that segment. - // this value is used to compute the block wand information. + avg_fieldnorm: Score, /* Average number of term in the field for that segment. + * this value is used to compute the block wand information. 
*/ } impl PostingsSerializer { diff --git a/src/postings/skip.rs b/src/postings/skip.rs index fb5c78e98..8632140d0 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -268,9 +268,7 @@ impl SkipReader { #[cfg(test)] mod tests { - use super::BlockInfo; - use super::IndexRecordOption; - use super::{SkipReader, SkipSerializer}; + use super::{BlockInfo, IndexRecordOption, SkipReader, SkipSerializer}; use crate::directory::OwnedBytes; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; diff --git a/src/postings/stacker/expull.rs b/src/postings/stacker/expull.rs index 9dc2a97a3..f0e8c29e4 100644 --- a/src/postings/stacker/expull.rs +++ b/src/postings/stacker/expull.rs @@ -1,9 +1,7 @@ -use super::{Addr, MemoryArena}; +use std::{io, mem}; -use crate::postings::stacker::memory_arena::load; -use crate::postings::stacker::memory_arena::store; -use std::io; -use std::mem; +use super::{Addr, MemoryArena}; +use crate::postings::stacker::memory_arena::{load, store}; const MAX_BLOCK_LEN: u32 = 1u32 << 15; const FIRST_BLOCK: usize = 16; @@ -181,11 +179,11 @@ impl ExpUnrolledLinkedList { #[cfg(test)] mod tests { - use super::super::MemoryArena; - use super::len_to_capacity; - use super::*; use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; + use super::super::MemoryArena; + use super::{len_to_capacity, *}; + #[test] fn test_stack() { let mut heap = MemoryArena::new(); @@ -303,11 +301,13 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { + use std::iter; + + use byteorder::{NativeEndian, WriteBytesExt}; + use test::Bencher; + use super::super::MemoryArena; use super::ExpUnrolledLinkedList; - use byteorder::{NativeEndian, WriteBytesExt}; - use std::iter; - use test::Bencher; const NUM_STACK: usize = 10_000; const STACK_SIZE: u32 = 1000; diff --git a/src/postings/stacker/memory_arena.rs b/src/postings/stacker/memory_arena.rs index 0685b2adc..94d24e5dd 100644 --- a/src/postings/stacker/memory_arena.rs +++ b/src/postings/stacker/memory_arena.rs @@ -22,8 +22,7 @@ //! //! Instead, you store and access your data via `.write(...)` and `.read(...)`, which under the hood //! stores your object using `ptr::write_unaligned` and `ptr::read_unaligned`. -use std::mem; -use std::ptr; +use std::{mem, ptr}; const NUM_BITS_PAGE_ADDR: usize = 20; const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large diff --git a/src/postings/stacker/term_hashmap.rs b/src/postings/stacker/term_hashmap.rs index e19b0d121..70cd24a2b 100644 --- a/src/postings/stacker/term_hashmap.rs +++ b/src/postings/stacker/term_hashmap.rs @@ -1,13 +1,12 @@ +use std::{iter, mem, slice}; + +use byteorder::{ByteOrder, NativeEndian}; use murmurhash32::murmurhash2; use super::{Addr, MemoryArena}; use crate::postings::stacker::memory_arena::store; use crate::postings::UnorderedTermId; use crate::Term; -use byteorder::{ByteOrder, NativeEndian}; -use std::iter; -use std::mem; -use std::slice; /// Returns the actual memory size in bytes /// required to create a table of size $2^num_bits$. @@ -49,7 +48,6 @@ impl KeyValue { /// The quirky API has the benefit of avoiding /// the computation of the hash of the key twice, /// or copying the key as long as there is no insert. 
-/// pub struct TermHashMap { table: Box<[KeyValue]>, pub heap: MemoryArena, @@ -235,9 +233,10 @@ impl TermHashMap { #[cfg(test)] mod tests { - use super::TermHashMap; use std::collections::HashMap; + use super::TermHashMap; + #[test] fn test_hash_map() { let mut hash_map: TermHashMap = TermHashMap::new(18); diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 8703b5589..a5043c912 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,8 +1,9 @@ -use common::{BinarySerializable, FixedSize}; use std::io; use std::iter::ExactSizeIterator; use std::ops::Range; +use common::{BinarySerializable, FixedSize}; + /// `TermInfo` wraps the metadata associated to a Term. /// It is segment-local. #[derive(Debug, Default, Eq, PartialEq, Clone)] diff --git a/src/query/all_query.rs b/src/query/all_query.rs index c42d4ffed..f47c33c7d 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -1,11 +1,9 @@ -use crate::core::Searcher; -use crate::core::SegmentReader; +use crate::core::{Searcher, SegmentReader}; use crate::docset::{DocSet, TERMINATED}; use crate::query::boost_query::BoostScorer; use crate::query::explanation::does_not_match; use crate::query::{Explanation, Query, Scorer, Weight}; -use crate::DocId; -use crate::Score; +use crate::{DocId, Score}; /// Query that matches all of the documents. /// diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 2d14cd693..fb7c2c1cb 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -1,16 +1,15 @@ -use crate::core::SegmentReader; -use crate::query::ConstScorer; -use crate::query::{BitSetDocSet, Explanation}; -use crate::query::{Scorer, Weight}; -use crate::schema::{Field, IndexRecordOption}; -use crate::termdict::{TermDictionary, TermStreamer}; -use crate::TantivyError; -use crate::{DocId, Score}; -use common::BitSet; use std::io; use std::sync::Arc; + +use common::BitSet; use tantivy_fst::Automaton; +use crate::core::SegmentReader; +use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight}; +use crate::schema::{Field, IndexRecordOption}; +use crate::termdict::{TermDictionary, TermStreamer}; +use crate::{DocId, Score, TantivyError}; + /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight { field: Field, @@ -85,12 +84,13 @@ where #[cfg(test)] mod tests { + use tantivy_fst::Automaton; + use super::AutomatonWeight; use crate::docset::TERMINATED; use crate::query::Weight; use crate::schema::{Schema, STRING}; use crate::Index; - use tantivy_fst::Automaton; fn create_index() -> crate::Result { let mut schema = Schema::builder(); diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs index 030fdeae7..ba4d7a3b1 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -1,6 +1,7 @@ +use common::{BitSet, TinySet}; + use crate::docset::{DocSet, TERMINATED}; use crate::DocId; -use common::{BitSet, TinySet}; /// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`. 
/// @@ -98,11 +99,12 @@ impl DocSet for BitSetDocSet { mod tests { use std::collections::BTreeSet; + use common::BitSet; + use super::BitSetDocSet; use crate::docset::{DocSet, TERMINATED}; use crate::tests::generate_nonunique_unsorted; use crate::DocId; - use common::BitSet; fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet { let mut docset = BitSet::with_max_value(max_doc); @@ -235,12 +237,9 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::BitSet; - use super::BitSetDocSet; + use super::{BitSet, BitSetDocSet}; use crate::docset::TERMINATED; - use crate::test; - use crate::tests; - use crate::DocSet; + use crate::{test, tests, DocSet}; #[bench] fn bench_bitset_1pct_insert(b: &mut test::Bencher) { diff --git a/src/query/bm25.rs b/src/query/bm25.rs index c056ba971..9d47ba929 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -1,10 +1,8 @@ +use serde::{Deserialize, Serialize}; + use crate::fieldnorm::FieldNormReader; use crate::query::Explanation; -use crate::Score; -use crate::Searcher; -use crate::Term; -use serde::Deserialize; -use serde::Serialize; +use crate::{Score, Searcher, Term}; const K1: Score = 1.2; const B: Score = 0.75; diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs index 020b3dc87..0361751eb 100644 --- a/src/query/boolean_query/block_wand.rs +++ b/src/query/boolean_query/block_wand.rs @@ -1,8 +1,8 @@ +use std::ops::{Deref, DerefMut}; + use crate::query::term_query::TermScorer; use crate::query::Scorer; use crate::{DocId, DocSet, Score, TERMINATED}; -use std::ops::Deref; -use std::ops::DerefMut; /// Takes a term_scorers sorted by their current doc() and a threshold and returns /// Returns (pivot_len, pivot_ord) defined as follows: @@ -216,10 +216,9 @@ pub fn block_wand( /// than the generic algorithm. /// The algorithm behaves as follows: /// - While we don't hit the end of the docset: -/// - While the block max score is under the `threshold`, go to the -/// next block. -/// - On a block, advance until the end and execute `callback`` -/// when the doc score is greater or equal to the `threshold`. +/// - While the block max score is under the `threshold`, go to the next block. +/// - On a block, advance until the end and execute `callback`` when the doc score is greater or +/// equal to the `threshold`. 
pub fn block_wand_single_scorer( mut scorer: TermScorer, mut threshold: Score, @@ -301,16 +300,17 @@ fn is_sorted>(mut it: I) -> bool { } #[cfg(test)] mod tests { - use crate::query::score_combiner::SumCombiner; - use crate::query::term_query::TermScorer; - use crate::query::Union; - use crate::query::{Bm25Weight, Scorer}; - use crate::{DocId, DocSet, Score, TERMINATED}; - use proptest::prelude::*; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::iter; + use proptest::prelude::*; + + use crate::query::score_combiner::SumCombiner; + use crate::query::term_query::TermScorer; + use crate::query::{Bm25Weight, Scorer, Union}; + use crate::{DocId, DocSet, Score, TERMINATED}; + struct Float(Score); impl Eq for Float {} diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index a819ad125..2719d3371 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -1,13 +1,10 @@ -use super::boolean_weight::BooleanWeight; -use crate::query::Occur; -use crate::query::Query; -use crate::query::TermQuery; -use crate::query::Weight; -use crate::schema::IndexRecordOption; -use crate::schema::Term; -use crate::Searcher; use std::collections::BTreeMap; +use super::boolean_weight::BooleanWeight; +use crate::query::{Occur, Query, TermQuery, Weight}; +use crate::schema::{IndexRecordOption, Term}; +use crate::Searcher; + /// The boolean query returns a set of documents /// that matches the Boolean combination of constituent subqueries. /// @@ -24,14 +21,14 @@ use std::collections::BTreeMap; /// You can combine other query types and their `Occur`ances into one `BooleanQuery` /// /// ```rust -///use tantivy::collector::Count; -///use tantivy::doc; -///use tantivy::query::{BooleanQuery, Occur, PhraseQuery, Query, TermQuery}; -///use tantivy::schema::{IndexRecordOption, Schema, TEXT}; -///use tantivy::Term; -///use tantivy::Index; +/// use tantivy::collector::Count; +/// use tantivy::doc; +/// use tantivy::query::{BooleanQuery, Occur, PhraseQuery, Query, TermQuery}; +/// use tantivy::schema::{IndexRecordOption, Schema, TEXT}; +/// use tantivy::Term; +/// use tantivy::Index; /// -///fn main() -> tantivy::Result<()> { +/// fn main() -> tantivy::Result<()> { /// let mut schema_builder = Schema::builder(); /// let title = schema_builder.add_text_field("title", TEXT); /// let body = schema_builder.add_text_field("body", TEXT); @@ -124,7 +121,7 @@ use std::collections::BTreeMap; /// let count4 = searcher.search(&nested_query, &Count)?; /// assert_eq!(count4, 1); /// Ok(()) -///} +/// } /// ``` #[derive(Debug)] pub struct BooleanQuery { diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 8d45c1499..522da9058 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -1,19 +1,16 @@ +use std::collections::HashMap; + use crate::core::SegmentReader; use crate::postings::FreqReadingOption; use crate::query::explanation::does_not_match; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner}; use crate::query::term_query::TermScorer; use crate::query::weight::{for_each_pruning_scorer, for_each_scorer}; -use crate::query::EmptyScorer; -use crate::query::Exclude; -use crate::query::Occur; -use crate::query::RequiredOptionalScorer; -use crate::query::Scorer; -use crate::query::Union; -use crate::query::Weight; -use crate::query::{intersect_scorers, Explanation}; +use crate::query::{ + 
intersect_scorers, EmptyScorer, Exclude, Explanation, Occur, RequiredOptionalScorer, Scorer, + Union, Weight, +}; use crate::{DocId, Score}; -use std::collections::HashMap; enum SpecializedScorer { TermUnion(Vec), @@ -21,9 +18,7 @@ enum SpecializedScorer { } fn scorer_union(scorers: Vec>) -> SpecializedScorer -where - TScoreCombiner: ScoreCombiner, -{ +where TScoreCombiner: ScoreCombiner { assert!(!scorers.is_empty()); if scorers.len() == 1 { return SpecializedScorer::Other(scorers.into_iter().next().unwrap()); //< we checked the size beforehands diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 370404919..a5a1c710b 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -2,29 +2,22 @@ mod block_wand; mod boolean_query; mod boolean_weight; -pub(crate) use self::block_wand::block_wand; -pub(crate) use self::block_wand::block_wand_single_scorer; +pub(crate) use self::block_wand::{block_wand, block_wand_single_scorer}; pub use self::boolean_query::BooleanQuery; #[cfg(test)] mod tests { use super::*; - use crate::assert_nearly_equals; use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::collector::TopDocs; use crate::query::score_combiner::SumWithCoordsCombiner; use crate::query::term_query::TermScorer; - use crate::query::Intersection; - use crate::query::Occur; - use crate::query::Query; - use crate::query::QueryParser; - use crate::query::RequiredOptionalScorer; - use crate::query::Scorer; - use crate::query::TermQuery; + use crate::query::{ + Intersection, Occur, Query, QueryParser, RequiredOptionalScorer, Scorer, TermQuery, + }; use crate::schema::*; - use crate::Index; - use crate::{DocAddress, DocId, Score}; + use crate::{assert_nearly_equals, DocAddress, DocId, Index, Score}; fn aux_test_helper() -> crate::Result<(Index, Field)> { let mut schema_builder = Schema::builder(); diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index 81ac29cc9..7b0539889 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -1,9 +1,10 @@ +use std::collections::BTreeMap; +use std::fmt; + use crate::fastfield::AliveBitSet; use crate::query::explanation::does_not_match; use crate::query::{Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term}; -use std::collections::BTreeMap; -use std::fmt; /// `BoostQuery` is a wrapper over a query used to boost its score. /// diff --git a/src/query/empty_query.rs b/src/query/empty_query.rs index b0fc1f2a2..7a16e6950 100644 --- a/src/query/empty_query.rs +++ b/src/query/empty_query.rs @@ -1,13 +1,8 @@ use super::Scorer; use crate::docset::TERMINATED; use crate::query::explanation::does_not_match; -use crate::query::Weight; -use crate::query::{Explanation, Query}; -use crate::DocId; -use crate::DocSet; -use crate::Score; -use crate::Searcher; -use crate::SegmentReader; +use crate::query::{Explanation, Query, Weight}; +use crate::{DocId, DocSet, Score, Searcher, SegmentReader}; /// `EmptyQuery` is a dummy `Query` in which no document matches. 
/// diff --git a/src/query/exclude.rs b/src/query/exclude.rs index 295fce777..0b13e66e0 100644 --- a/src/query/exclude.rs +++ b/src/query/exclude.rs @@ -1,7 +1,6 @@ use crate::docset::{DocSet, TERMINATED}; use crate::query::Scorer; -use crate::DocId; -use crate::Score; +use crate::{DocId, Score}; #[inline] fn is_within(docset: &mut TDocSetExclude, doc: DocId) -> bool { diff --git a/src/query/explanation.rs b/src/query/explanation.rs index b065afce4..a4cce46dd 100644 --- a/src/query/explanation.rs +++ b/src/query/explanation.rs @@ -1,7 +1,9 @@ -use crate::{DocId, Score, TantivyError}; -use serde::Serialize; use std::fmt; +use serde::Serialize; + +use crate::{DocId, Score, TantivyError}; + pub(crate) fn does_not_match(doc: DocId) -> TantivyError { TantivyError::InvalidArgument(format!("Document #({}) does not match", doc)) } diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index f3f9e3c5f..4f3492da9 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -1,12 +1,14 @@ +use std::collections::HashMap; +use std::ops::Range; + +use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA}; +use once_cell::sync::Lazy; +use tantivy_fst::Automaton; + use crate::query::{AutomatonWeight, Query, Weight}; use crate::schema::Term; use crate::Searcher; use crate::TantivyError::InvalidArgument; -use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA}; -use once_cell::sync::Lazy; -use std::collections::HashMap; -use std::ops::Range; -use tantivy_fst::Automaton; pub(crate) struct DfaWrapper(pub DFA); @@ -168,13 +170,9 @@ impl Query for FuzzyTermQuery { #[cfg(test)] mod test { use super::FuzzyTermQuery; - use crate::assert_nearly_equals; - use crate::collector::Count; - use crate::collector::TopDocs; - use crate::schema::Schema; - use crate::schema::TEXT; - use crate::Index; - use crate::Term; + use crate::collector::{Count, TopDocs}; + use crate::schema::{Schema, TEXT}; + use crate::{assert_nearly_equals, Index, Term}; #[test] pub fn test_fuzzy_term() -> crate::Result<()> { diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 573ab6e36..e86f0eb1f 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -1,9 +1,7 @@ use crate::docset::{DocSet, TERMINATED}; use crate::query::term_query::TermScorer; -use crate::query::EmptyScorer; -use crate::query::Scorer; -use crate::DocId; -use crate::Score; +use crate::query::{EmptyScorer, Scorer}; +use crate::{DocId, Score}; /// Returns the intersection scorer. /// diff --git a/src/query/mod.rs b/src/query/mod.rs index c43ccb87d..759bc2e72 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -1,4 +1,4 @@ -/*! Query Module */ +//! 
Query Module mod all_query; mod automaton_weight; @@ -27,16 +27,12 @@ mod weight; mod vec_docset; pub(crate) mod score_combiner; -pub(crate) use self::bm25::Bm25Weight; -pub use self::intersection::Intersection; -pub use self::union::Union; - -#[cfg(test)] -pub use self::vec_docset::VecDocSet; +pub use tantivy_query_grammar::Occur; pub use self::all_query::{AllQuery, AllScorer, AllWeight}; pub use self::automaton_weight::AutomatonWeight; pub use self::bitset::BitSetDocSet; +pub(crate) use self::bm25::Bm25Weight; pub use self::boolean_query::BooleanQuery; pub use self::boost_query::BoostQuery; pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight}; @@ -45,28 +41,28 @@ pub use self::explanation::Explanation; #[cfg(test)] pub(crate) use self::fuzzy_query::DfaWrapper; pub use self::fuzzy_query::FuzzyTermQuery; -pub use self::intersection::intersect_scorers; +pub use self::intersection::{intersect_scorers, Intersection}; pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder}; pub use self::phrase_query::PhraseQuery; pub use self::query::{Query, QueryClone}; -pub use self::query_parser::QueryParser; -pub use self::query_parser::QueryParserError; +pub use self::query_parser::{QueryParser, QueryParserError}; pub use self::range_query::RangeQuery; pub use self::regex_query::RegexQuery; pub use self::reqopt_scorer::RequiredOptionalScorer; -pub use self::scorer::ConstScorer; -pub use self::scorer::Scorer; +pub use self::scorer::{ConstScorer, Scorer}; pub use self::term_query::TermQuery; +pub use self::union::Union; +#[cfg(test)] +pub use self::vec_docset::VecDocSet; pub use self::weight::Weight; -pub use tantivy_query_grammar::Occur; #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use crate::query::QueryParser; use crate::schema::{Schema, TEXT}; - use crate::Index; - use crate::Term; - use std::collections::BTreeMap; + use crate::{Index, Term}; #[test] fn test_query_terms() { diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 02d0de819..7c990bc85 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -1,12 +1,11 @@ use std::cmp::Reverse; use std::collections::{BinaryHeap, HashMap}; -use crate::{ - query::{bm25::idf, BooleanQuery, BoostQuery, Occur, Query, TermQuery}, - schema::{Field, FieldType, FieldValue, IndexRecordOption, Term, Value}, - tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer}, - DocAddress, Result, Searcher, TantivyError, -}; +use crate::query::bm25::idf; +use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery}; +use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value}; +use crate::tokenizer::{BoxTokenStream, FacetTokenizer, PreTokenizedStream, Tokenizer}; +use crate::{DocAddress, Result, Searcher, TantivyError}; #[derive(Debug, PartialEq)] struct ScoreTerm { @@ -92,7 +91,7 @@ impl MoreLikeThis { pub fn query_with_document_fields( &self, searcher: &Searcher, - doc_fields: &[(Field, Vec)], + doc_fields: &[(Field, Vec)], ) -> Result { let score_terms = self.retrieve_terms_from_doc_fields(searcher, doc_fields)?; let query = self.create_query(score_terms); @@ -126,20 +125,17 @@ impl MoreLikeThis { doc_address: DocAddress, ) -> Result> { let doc = searcher.doc(doc_address)?; - let field_to_field_values = doc + let field_to_values = doc .get_sorted_field_values() .iter() .map(|(field, values)| { ( *field, - values - .iter() - .map(|v| (**v).clone()) - .collect::>(), + values.iter().map(|v| 
(**v).clone()).collect::>(), ) }) .collect::>(); - self.retrieve_terms_from_doc_fields(searcher, &field_to_field_values) + self.retrieve_terms_from_doc_fields(searcher, &field_to_values) } /// Finds terms for a more-like-this query. @@ -147,15 +143,18 @@ impl MoreLikeThis { fn retrieve_terms_from_doc_fields( &self, searcher: &Searcher, - field_to_field_values: &[(Field, Vec)], + field_to_values: &[(Field, Vec)], ) -> Result> { - if field_to_field_values.is_empty() { - return Err(TantivyError::InvalidArgument("Cannot create more like this query on empty field values. The document may not have stored fields".to_string())); + if field_to_values.is_empty() { + return Err(TantivyError::InvalidArgument( + "Cannot create more like this query on empty field values. The document may not \ + have stored fields" + .to_string(), + )); } - let mut field_to_term_freq_map = HashMap::new(); - for (field, field_values) in field_to_field_values { - self.add_term_frequencies(searcher, *field, field_values, &mut field_to_term_freq_map)?; + for (field, values) in field_to_values { + self.add_term_frequencies(searcher, *field, values, &mut field_to_term_freq_map)?; } self.create_score_term(searcher, field_to_term_freq_map) } @@ -167,7 +166,7 @@ impl MoreLikeThis { &self, searcher: &Searcher, field: Field, - field_values: &[FieldValue], + values: &[Value], term_frequencies: &mut HashMap, ) -> Result<()> { let schema = searcher.schema(); @@ -181,9 +180,9 @@ impl MoreLikeThis { // extract the raw value, possibly tokenizing & filtering to update the term frequency map match field_entry.field_type() { FieldType::Facet(_) => { - let facets: Vec<&str> = field_values + let facets: Vec<&str> = values .iter() - .map(|field_value| match *field_value.value() { + .map(|value| match value { Value::Facet(ref facet) => Ok(facet.encoded_str()), _ => Err(TantivyError::InvalidArgument( "invalid field value".to_string(), @@ -202,8 +201,8 @@ impl MoreLikeThis { FieldType::Str(text_options) => { let mut token_streams: Vec = vec![]; - for field_value in field_values { - match field_value.value() { + for value in values { + match value { Value::PreTokStr(tok_str) => { token_streams.push(PreTokenizedStream::from(tok_str.clone()).into()); } @@ -232,8 +231,8 @@ impl MoreLikeThis { } } FieldType::U64(_) => { - for field_value in field_values { - let val = field_value.value().u64_value().ok_or_else(|| { + for value in values { + let val = value.as_u64().ok_or_else(|| { TantivyError::InvalidArgument("invalid value".to_string()) })?; if !self.is_noise_word(val.to_string()) { @@ -243,11 +242,10 @@ impl MoreLikeThis { } } FieldType::Date(_) => { - for field_value in field_values { + for value in values { // TODO: Ask if this is the semantic (timestamp) we want - let val = field_value - .value() - .date_value() + let val = value + .as_date() .ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))? 
.timestamp(); if !self.is_noise_word(val.to_string()) { @@ -257,8 +255,8 @@ impl MoreLikeThis { } } FieldType::I64(_) => { - for field_value in field_values { - let val = field_value.value().i64_value().ok_or_else(|| { + for value in values { + let val = value.as_i64().ok_or_else(|| { TantivyError::InvalidArgument("invalid value".to_string()) })?; if !self.is_noise_word(val.to_string()) { @@ -268,8 +266,8 @@ impl MoreLikeThis { } } FieldType::F64(_) => { - for field_value in field_values { - let val = field_value.value().f64_value().ok_or_else(|| { + for value in values { + let val = value.as_f64().ok_or_else(|| { TantivyError::InvalidArgument("invalid value".to_string()) })?; if !self.is_noise_word(val.to_string()) { diff --git a/src/query/more_like_this/query.rs b/src/query/more_like_this/query.rs index 5e2db1816..d8114ff88 100644 --- a/src/query/more_like_this/query.rs +++ b/src/query/more_like_this/query.rs @@ -1,10 +1,7 @@ use super::MoreLikeThis; - -use crate::{ - query::{Query, Weight}, - schema::{Field, FieldValue}, - DocAddress, Result, Searcher, -}; +use crate::query::{Query, Weight}; +use crate::schema::{Field, Value}; +use crate::{DocAddress, Result, Searcher}; /// A query that matches all of the documents similar to a document /// or a set of field values provided. @@ -24,7 +21,6 @@ use crate::{ /// .with_boost_factor(1.0) /// .with_stop_words(vec!["for".to_string()]) /// .with_document(DocAddress::new(2, 1)); -/// /// ``` #[derive(Debug, Clone)] pub struct MoreLikeThisQuery { @@ -35,7 +31,7 @@ pub struct MoreLikeThisQuery { #[derive(Debug, PartialEq, Clone)] enum TargetDocument { DocumentAdress(DocAddress), - DocumentFields(Vec<(Field, Vec)>), + DocumentFields(Vec<(Field, Vec)>), } impl MoreLikeThisQuery { @@ -156,10 +152,7 @@ impl MoreLikeThisQueryBuilder { /// that will be used to compose the resulting query. /// This interface is meant to be used when you want to provide your own set of fields /// not necessarily from a specific document. 
- pub fn with_document_fields( - self, - doc_fields: Vec<(Field, Vec)>, - ) -> MoreLikeThisQuery { + pub fn with_document_fields(self, doc_fields: Vec<(Field, Vec)>) -> MoreLikeThisQuery { MoreLikeThisQuery { mlt: self.mlt, target: TargetDocument::DocumentFields(doc_fields), @@ -169,12 +162,10 @@ impl MoreLikeThisQueryBuilder { #[cfg(test)] mod tests { - use super::MoreLikeThisQuery; - use super::TargetDocument; + use super::{MoreLikeThisQuery, TargetDocument}; use crate::collector::TopDocs; use crate::schema::{Schema, STORED, TEXT}; - use crate::DocAddress; - use crate::Index; + use crate::{DocAddress, Index}; fn create_test_index() -> crate::Result { let mut schema_builder = Schema::builder(); diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 40c50b014..767c4e1fa 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -10,13 +10,11 @@ pub use self::phrase_weight::PhraseWeight; pub mod tests { use super::*; - use crate::assert_nearly_equals; use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE}; use crate::core::Index; use crate::query::Weight; use crate::schema::{Schema, Term, TEXT}; - use crate::DocId; - use crate::{DocAddress, TERMINATED}; + use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED}; pub fn create_index(texts: &[&'static str]) -> crate::Result { let mut schema_builder = Schema::builder(); @@ -124,9 +122,7 @@ pub mod tests { #[test] pub fn test_phrase_query_no_positions() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - use crate::schema::IndexRecordOption; - use crate::schema::TextFieldIndexing; - use crate::schema::TextOptions; + use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; let no_positions = TextOptions::default().set_indexing_options( TextFieldIndexing::default() .set_tokenizer("default") diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 4580a9f1f..34af3e571 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -3,10 +3,8 @@ use std::collections::BTreeMap; use super::PhraseWeight; use crate::core::searcher::Searcher; use crate::query::bm25::Bm25Weight; -use crate::query::Query; -use crate::query::Weight; -use crate::schema::IndexRecordOption; -use crate::schema::{Field, Term}; +use crate::query::{Query, Weight}; +use crate::schema::{Field, IndexRecordOption, Term}; /// `PhraseQuery` matches a specific sequence of words. /// @@ -21,7 +19,6 @@ use crate::schema::{Field, Term}; /// /// Using a `PhraseQuery` on a field requires positions /// to be indexed for this field. 
-/// #[derive(Clone, Debug)] pub struct PhraseQuery { field: Field, diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 1b7d60265..e0eeec294 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -1,10 +1,11 @@ +use std::cmp::Ordering; + use crate::docset::{DocSet, TERMINATED}; use crate::fieldnorm::FieldNormReader; use crate::postings::Postings; use crate::query::bm25::Bm25Weight; use crate::query::{Intersection, Scorer}; use crate::{DocId, Score}; -use std::cmp::Ordering; struct PostingsWithOffset { offset: u32, @@ -295,9 +296,10 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::{intersection, intersection_count}; use test::Bencher; + use super::{intersection, intersection_count}; + #[bench] fn bench_intersection_short(b: &mut Bencher) { b.iter(|| { diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 0270b5dc4..9c08c3557 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -4,13 +4,9 @@ use crate::fieldnorm::FieldNormReader; use crate::postings::SegmentPostings; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; -use crate::query::Scorer; -use crate::query::Weight; -use crate::query::{EmptyScorer, Explanation}; -use crate::schema::IndexRecordOption; -use crate::schema::Term; -use crate::Score; -use crate::{DocId, DocSet}; +use crate::query::{EmptyScorer, Explanation, Scorer, Weight}; +use crate::schema::{IndexRecordOption, Term}; +use crate::{DocId, DocSet, Score}; pub struct PhraseWeight { phrase_terms: Vec<(usize, Term)>, diff --git a/src/query/query.rs b/src/query/query.rs index 03451be75..05dbf7242 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -1,11 +1,12 @@ +use std::collections::BTreeMap; +use std::fmt; + +use downcast_rs::impl_downcast; + use super::Weight; use crate::core::searcher::Searcher; use crate::query::Explanation; -use crate::DocAddress; -use crate::Term; -use downcast_rs::impl_downcast; -use std::collections::BTreeMap; -use std::fmt; +use crate::{DocAddress, Term}; /// The `Query` trait defines a set of documents and a scoring method /// for those documents. 
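The `Query` trait whose imports are reshuffled above is the entry point the rest of this diff keeps exercising: the searcher turns a query into per-segment weights and scorers, and a collector consumes the matching documents. A short end-to-end sketch, not part of this patch, using only public API already shown in these doctests plus a hypothetical `title` field:

use tantivy::collector::Count;
use tantivy::query::TermQuery;
use tantivy::schema::{IndexRecordOption, Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    // Hypothetical single-field schema, with the index kept in RAM.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(50_000_000)?;
    let _ = writer.add_document(doc!(title => "the old man and the sea"));
    let _ = writer.add_document(doc!(title => "the modern prometheus"));
    writer.commit()?;

    // `TermQuery` is one implementor of `Query`; the searcher drives it and
    // feeds every matching document to the `Count` collector.
    let query = TermQuery::new(
        Term::from_field_text(title, "old"),
        IndexRecordOption::Basic,
    );
    let searcher = index.reader()?.searcher();
    let count = searcher.search(&query, &Count)?;
    assert_eq!(count, 1);
    Ok(())
}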
@@ -81,8 +82,7 @@ pub trait QueryClone { } impl QueryClone for T -where - T: 'static + Query + Clone, +where T: 'static + Query + Clone { fn box_clone(&self) -> Box { Box::new(self.clone()) diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs index e443c29f9..9d26c3cd6 100644 --- a/src/query/query_parser/logical_ast.rs +++ b/src/query/query_parser/logical_ast.rs @@ -1,11 +1,10 @@ -use crate::query::Occur; -use crate::schema::Field; -use crate::schema::Term; -use crate::schema::Type; -use crate::Score; use std::fmt; use std::ops::Bound; +use crate::query::Occur; +use crate::schema::{Field, Term, Type}; +use crate::Score; + #[derive(Clone)] pub enum LogicalLiteral { Term(Term), diff --git a/src/query/query_parser/mod.rs b/src/query/query_parser/mod.rs index ee7a5b4aa..66c0d3d26 100644 --- a/src/query/query_parser/mod.rs +++ b/src/query/query_parser/mod.rs @@ -1,5 +1,4 @@ mod query_parser; pub mod logical_ast; -pub use self::query_parser::QueryParser; -pub use self::query_parser::QueryParserError; +pub use self::query_parser::{QueryParser, QueryParserError}; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 96839b99c..2cfcec7f5 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -1,25 +1,21 @@ -use super::logical_ast::*; -use crate::core::Index; -use crate::query::BooleanQuery; -use crate::query::EmptyQuery; -use crate::query::Occur; -use crate::query::PhraseQuery; -use crate::query::Query; -use crate::query::RangeQuery; -use crate::query::TermQuery; -use crate::query::{AllQuery, BoostQuery}; -use crate::schema::{Facet, FacetParseError, IndexRecordOption}; -use crate::schema::{Field, Schema}; -use crate::schema::{FieldType, Term}; -use crate::tokenizer::TokenizerManager; -use crate::Score; use std::borrow::Cow; use std::collections::HashMap; use std::num::{ParseFloatError, ParseIntError}; use std::ops::Bound; use std::str::FromStr; + use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf}; +use super::logical_ast::*; +use crate::core::Index; +use crate::query::{ + AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery, + TermQuery, +}; +use crate::schema::{Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term}; +use crate::tokenizer::TokenizerManager; +use crate::Score; + /// Possible error that may happen when parsing a query. #[derive(Debug, PartialEq, Eq, Error)] pub enum QueryParserError { @@ -123,10 +119,9 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// /// The language covered by the current parser is extremely simple. /// -/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using -/// tantivy's [`SimpleTokenizer`](../tokenizer/struct.SimpleTokenizer.html), hence -/// becoming `["barack", "obama"]`. The terms are then searched within -/// the default terms of the query parser. +/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using tantivy's +/// [`SimpleTokenizer`](../tokenizer/struct.SimpleTokenizer.html), hence becoming `["barack", +/// "obama"]`. The terms are then searched within the default terms of the query parser. /// /// e.g. If `body` and `title` are default fields, our example terms are /// `["title:barack", "body:barack", "title:obama", "body:obama"]`. @@ -143,33 +138,35 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`. 
/// /// -/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is interpreted +/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is +/// interpreted /// as `(a AND b) OR c`. /// -/// * In addition to the boolean operators, the `-`, `+` can help define. These operators -/// are sufficient to express all queries using boolean operators. For instance `x AND y OR z` can -/// be written (`(+x +y) z`). In addition, these operators can help define "required optional" -/// queries. `(+x y)` matches the same document set as simply `x`, but `y` will help refining the score. +/// * In addition to the boolean operators, the `-`, `+` can help define. These operators are +/// sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be +/// written (`(+x +y) z`). In addition, these operators can help define "required optional" +/// queries. `(+x y)` matches the same document set as simply `x`, but `y` will help refining the +/// score. /// -/// * negative terms: By prepending a term by a `-`, a term can be excluded -/// from the search. This is useful for disambiguating a query. -/// e.g. `apple -fruit` +/// * negative terms: By prepending a term by a `-`, a term can be excluded from the search. This is +/// useful for disambiguating a query. e.g. `apple -fruit` /// /// * must terms: By prepending a term by a `+`, a term can be made required for the search. /// -/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. -/// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed -/// by "obama". +/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. e.g., +/// `title:"Barack Obama"` will only find documents that have "barack" immediately followed by +/// "obama". /// /// * range terms: Range searches can be done by specifying the start and end bound. These can be -/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains -/// a word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound). +/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains a +/// word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound). /// Inclusive bounds are `[]`, exclusive are `{}`. /// -/// * date values: The query parser supports rfc3339 formatted dates. For example `"2002-10-02T15:00:00.05Z"` -/// or `some_date_field:[2002-10-02T15:00:00Z TO 2002-10-02T18:00:00Z}` +/// * date values: The query parser supports rfc3339 formatted dates. For example +/// `"2002-10-02T15:00:00.05Z"` or `some_date_field:[2002-10-02T15:00:00Z TO +/// 2002-10-02T18:00:00Z}` /// -/// * all docs query: A plain `*` will match all documents in the index. +/// * all docs query: A plain `*` will match all documents in the index. /// /// Parts of the queries can be boosted by appending `^boostfactor`. /// For instance, `"SRE"^2.0 OR devops^0.4` will boost documents containing `SRE` instead of @@ -200,8 +197,7 @@ fn all_negative(ast: &LogicalAst) -> bool { impl QueryParser { /// Creates a `QueryParser`, given /// * schema - index Schema - /// * default_fields - fields used to search if no field is specifically defined - /// in the query. + /// * default_fields - fields used to search if no field is specifically defined in the query. 
pub fn new( schema: Schema, default_fields: Vec, @@ -583,19 +579,19 @@ fn convert_to_query(logical_ast: LogicalAst) -> Box { #[cfg(test)] mod test { + use matches::assert_matches; + use super::super::logical_ast::*; - use super::QueryParser; - use super::QueryParserError; + use super::{QueryParser, QueryParserError}; use crate::query::Query; - use crate::schema::FacetOptions; - use crate::schema::Field; - use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; - use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT}; + use crate::schema::{ + FacetOptions, Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, + INDEXED, STORED, STRING, TEXT, + }; use crate::tokenizer::{ LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager, }; use crate::Index; - use matches::assert_matches; fn make_schema() -> Schema { let mut schema_builder = Schema::builder(); @@ -691,7 +687,8 @@ mod test { let query = query_parser.parse_query("title:[A TO B]").unwrap(); assert_eq!( format!("{:?}", query), - "Boost(query=RangeQuery { field: Field(0), value_type: Str, left_bound: Included([97]), right_bound: Included([98]) }, boost=2)" + "Boost(query=RangeQuery { field: Field(0), value_type: Str, left_bound: \ + Included([97]), right_bound: Included([98]) }, boost=2)" ); } diff --git a/src/query/range_query.rs b/src/query/range_query.rs index c2dee2524..700da7128 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -1,18 +1,16 @@ -use crate::core::Searcher; -use crate::core::SegmentReader; -use crate::error::TantivyError; -use crate::query::explanation::does_not_match; -use crate::query::ConstScorer; -use crate::query::{BitSetDocSet, Explanation}; -use crate::query::{Query, Scorer, Weight}; -use crate::schema::Type; -use crate::schema::{Field, IndexRecordOption, Term}; -use crate::termdict::{TermDictionary, TermStreamer}; -use crate::{DocId, Score}; -use common::BitSet; use std::io; use std::ops::{Bound, Range}; +use common::BitSet; + +use crate::core::{Searcher, SegmentReader}; +use crate::error::TantivyError; +use crate::query::explanation::does_not_match; +use crate::query::{BitSetDocSet, ConstScorer, Explanation, Query, Scorer, Weight}; +use crate::schema::{Field, IndexRecordOption, Term, Type}; +use crate::termdict::{TermDictionary, TermStreamer}; +use crate::{DocId, Score}; + fn map_bound TTo>( bound: &Bound, transform: &Transform, @@ -330,12 +328,13 @@ impl Weight for RangeWeight { #[cfg(test)] mod tests { + use std::ops::Bound; + use super::RangeQuery; use crate::collector::{Count, TopDocs}; use crate::query::QueryParser; use crate::schema::{Document, Field, Schema, INDEXED, TEXT}; use crate::Index; - use std::ops::Bound; #[test] fn test_range_query_simple() -> crate::Result<()> { diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index 013da9177..24dbcdab7 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -1,10 +1,12 @@ +use std::clone::Clone; +use std::sync::Arc; + +use tantivy_fst::Regex; + use crate::error::TantivyError; use crate::query::{AutomatonWeight, Query, Weight}; use crate::schema::Field; use crate::Searcher; -use std::clone::Clone; -use std::sync::Arc; -use tantivy_fst::Regex; /// A Regex Query matches all of the documents /// containing a specific term that matches @@ -91,15 +93,15 @@ impl Query for RegexQuery { #[cfg(test)] mod test { - use super::RegexQuery; - use crate::assert_nearly_equals; - use crate::collector::TopDocs; - use crate::schema::TEXT; - use 
crate::schema::{Field, Schema}; - use crate::{Index, IndexReader}; use std::sync::Arc; + use tantivy_fst::Regex; + use super::RegexQuery; + use crate::collector::TopDocs; + use crate::schema::{Field, Schema, TEXT}; + use crate::{assert_nearly_equals, Index, IndexReader}; + fn build_test_index() -> crate::Result<(IndexReader, Field)> { let mut schema_builder = Schema::builder(); let country_field = schema_builder.add_text_field("country", TEXT); diff --git a/src/query/reqopt_scorer.rs b/src/query/reqopt_scorer.rs index a3d39a928..427a443b1 100644 --- a/src/query/reqopt_scorer.rs +++ b/src/query/reqopt_scorer.rs @@ -1,9 +1,9 @@ +use std::marker::PhantomData; + use crate::docset::DocSet; use crate::query::score_combiner::ScoreCombiner; use crate::query::Scorer; -use crate::DocId; -use crate::Score; -use std::marker::PhantomData; +use crate::{DocId, Score}; /// Given a required scorer and an optional scorer /// matches all document from the required scorer @@ -89,9 +89,7 @@ mod tests { use crate::docset::{DocSet, TERMINATED}; use crate::postings::tests::test_skip_against_unoptimized; use crate::query::score_combiner::{DoNothingCombiner, SumCombiner}; - use crate::query::ConstScorer; - use crate::query::Scorer; - use crate::query::VecDocSet; + use crate::query::{ConstScorer, Scorer, VecDocSet}; use crate::tests::sample_with_seed; #[test] diff --git a/src/query/score_combiner.rs b/src/query/score_combiner.rs index 7af0e876a..5fd68af45 100644 --- a/src/query/score_combiner.rs +++ b/src/query/score_combiner.rs @@ -21,7 +21,6 @@ pub trait ScoreCombiner: Default + Clone + Send + Copy + 'static { /// even call the scorers `.score()` function. /// /// It is useful to optimize the case when scoring is disabled. -/// #[derive(Default, Clone, Copy)] //< these should not be too much work :) pub struct DoNothingCombiner; diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 4fa2b95df..1dcba188a 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -1,9 +1,10 @@ -use crate::docset::DocSet; -use crate::DocId; -use crate::Score; -use downcast_rs::impl_downcast; use std::ops::DerefMut; +use downcast_rs::impl_downcast; + +use crate::docset::DocSet; +use crate::{DocId, Score}; + /// Scored set of documents matching a query within a specific segment. /// /// See [`Query`](./trait.Query.html). 
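The `DoNothingCombiner` note kept above ("useful to optimize the case when scoring is disabled") is easiest to see with a stripped-down stand-in for the combiner idea. The real tantivy trait has a different shape (it is fed scorers, not raw scores), so everything below is an illustrative sketch rather than the library's API:

type Score = f32;

trait ScoreCombiner: Default + Clone + Copy {
    fn update(&mut self, score: Score);
    fn score(&self) -> Score;
}

// Sums the scores of the sub-scorers that matched the current document.
#[derive(Default, Clone, Copy)]
struct SumCombiner {
    total: Score,
}

impl ScoreCombiner for SumCombiner {
    fn update(&mut self, score: Score) {
        self.total += score;
    }
    fn score(&self) -> Score {
        self.total
    }
}

// When scoring is disabled, the per-hit work should compile down to nothing.
#[derive(Default, Clone, Copy)]
struct DoNothingCombiner;

impl ScoreCombiner for DoNothingCombiner {
    fn update(&mut self, _score: Score) {}
    fn score(&self) -> Score {
        0.0
    }
}

fn main() {
    let mut sum = SumCombiner::default();
    for s in [1.5f32, 0.25, 2.0] {
        sum.update(s);
    }
    assert_eq!(sum.score(), 3.75);

    let mut noop = DoNothingCombiner::default();
    noop.update(10.0);
    assert_eq!(noop.score(), 0.0);
}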
diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 81cc0f0aa..d519f8b30 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -14,8 +14,7 @@ mod tests { use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::query::{Query, QueryParser, Scorer, TermQuery}; use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT}; - use crate::{assert_nearly_equals, DocAddress}; - use crate::{Index, Term, TERMINATED}; + use crate::{assert_nearly_equals, DocAddress, Index, Term, TERMINATED}; #[test] pub fn test_term_query_no_freq() -> crate::Result<()> { diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 44f3e0d72..1c2202139 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -1,13 +1,12 @@ -use super::term_weight::TermWeight; -use crate::query::bm25::Bm25Weight; -use crate::query::Weight; -use crate::query::{Explanation, Query}; -use crate::schema::IndexRecordOption; -use crate::Searcher; -use crate::Term; use std::collections::BTreeMap; use std::fmt; +use super::term_weight::TermWeight; +use crate::query::bm25::Bm25Weight; +use crate::query::{Explanation, Query, Weight}; +use crate::schema::IndexRecordOption; +use crate::{Searcher, Term}; + /// A Term query matches all of the documents /// containing a specific term. /// diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 28607ec04..5badb55cf 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -1,12 +1,9 @@ use crate::docset::DocSet; -use crate::query::{Explanation, Scorer}; -use crate::DocId; -use crate::Score; - use crate::fieldnorm::FieldNormReader; -use crate::postings::SegmentPostings; -use crate::postings::{FreqReadingOption, Postings}; +use crate::postings::{FreqReadingOption, Postings, SegmentPostings}; use crate::query::bm25::Bm25Weight; +use crate::query::{Explanation, Scorer}; +use crate::{DocId, Score}; #[derive(Clone)] pub struct TermScorer { @@ -128,16 +125,17 @@ impl Scorer for TermScorer { #[cfg(test)] mod tests { + use futures::executor::block_on; + use proptest::prelude::*; + use crate::merge_policy::NoMergePolicy; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::query::term_query::TermScorer; use crate::query::{Bm25Weight, Scorer, TermQuery}; use crate::schema::{IndexRecordOption, Schema, TEXT}; - use crate::Score; - use crate::{assert_nearly_equals, Index, Searcher, SegmentId, Term}; - use crate::{DocId, DocSet, TERMINATED}; - use futures::executor::block_on; - use proptest::prelude::*; + use crate::{ + assert_nearly_equals, DocId, DocSet, Index, Score, Searcher, SegmentId, Term, TERMINATED, + }; #[test] fn test_term_scorer_max_score() -> crate::Result<()> { diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 46a0dc392..9824487e9 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -6,11 +6,9 @@ use crate::postings::SegmentPostings; use crate::query::bm25::Bm25Weight; use crate::query::explanation::does_not_match; use crate::query::weight::for_each_scorer; -use crate::query::Weight; -use crate::query::{Explanation, Scorer}; +use crate::query::{Explanation, Scorer, Weight}; use crate::schema::IndexRecordOption; -use crate::Term; -use crate::{DocId, Score}; +use crate::{DocId, Score, Term}; pub struct TermWeight { term: Term, diff --git a/src/query/union.rs b/src/query/union.rs index cf7b4d956..aa6ee75e8 
100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -1,9 +1,9 @@ +use common::TinySet; + use crate::docset::{DocSet, TERMINATED}; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner}; use crate::query::Scorer; -use crate::DocId; -use crate::Score; -use common::TinySet; +use crate::{DocId, Score}; const HORIZON_NUM_TINYBITSETS: usize = 64; const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32; @@ -14,9 +14,7 @@ const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32; // // Also, it does not "yield" any elements. fn unordered_drain_filter(v: &mut Vec, mut predicate: P) -where - P: FnMut(&mut T) -> bool, -{ +where P: FnMut(&mut T) -> bool { let mut i = 0; while i < v.len() { if predicate(&mut v[i]) { @@ -249,16 +247,14 @@ where #[cfg(test)] mod tests { - use super::Union; - use super::HORIZON; + use std::collections::BTreeSet; + + use super::{Union, HORIZON}; use crate::docset::{DocSet, TERMINATED}; use crate::postings::tests::test_skip_against_unoptimized; use crate::query::score_combiner::DoNothingCombiner; - use crate::query::ConstScorer; - use crate::query::VecDocSet; - use crate::tests; - use crate::DocId; - use std::collections::BTreeSet; + use crate::query::{ConstScorer, VecDocSet}; + use crate::{tests, DocId}; fn aux_test_union(vals: Vec>) { let mut val_set: BTreeSet = BTreeSet::new(); @@ -396,12 +392,11 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { + use test::Bencher; + use crate::query::score_combiner::DoNothingCombiner; use crate::query::{ConstScorer, Union, VecDocSet}; - use crate::DocId; - use crate::DocSet; - use crate::{tests, TERMINATED}; - use test::Bencher; + use crate::{tests, DocId, DocSet, TERMINATED}; #[bench] fn bench_union_3_high(bench: &mut Bencher) { diff --git a/src/query/vec_docset.rs b/src/query/vec_docset.rs index 3f765ef58..7b2b08846 100644 --- a/src/query/vec_docset.rs +++ b/src/query/vec_docset.rs @@ -1,8 +1,9 @@ #![allow(dead_code)] +use common::HasLen; + use crate::docset::{DocSet, TERMINATED}; use crate::DocId; -use common::HasLen; /// Simulate a `Postings` objects from a `VecPostings`. /// `VecPostings` only exist for testing purposes. diff --git a/src/reader/mod.rs b/src/reader/mod.rs index df0ee21ef..53f82c88b 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1,23 +1,19 @@ mod pool; mod warming; +use std::convert::TryInto; +use std::io; +use std::sync::atomic::AtomicU64; +use std::sync::{atomic, Arc, Weak}; + +pub use warming::Warmer; + pub use self::pool::LeasedItem; use self::pool::Pool; use self::warming::WarmingState; use crate::core::searcher::SearcherGeneration; -use crate::directory::WatchHandle; -use crate::directory::META_LOCK; -use crate::directory::{Directory, WatchCallback}; -use crate::Index; -use crate::Searcher; -use crate::SegmentReader; -use crate::{Inventory, TrackedObject}; -use std::sync::atomic; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use std::sync::Weak; -use std::{convert::TryInto, io}; -pub use warming::Warmer; +use crate::directory::{Directory, WatchCallback, WatchHandle, META_LOCK}; +use crate::{Index, Inventory, Searcher, SegmentReader, TrackedObject}; /// Defines when a new version of the index should be reloaded. /// @@ -29,7 +25,8 @@ pub enum ReloadPolicy { /// The index is entirely reloaded manually. /// All updates of the index should be manual. /// - /// No change is reflected automatically. You are required to call `IndexReader::reload()` manually. + /// No change is reflected automatically. 
You are required to call `IndexReader::reload()` + /// manually. Manual, /// The index is reloaded within milliseconds after a new commit is available. /// This is made possible by watching changes in the `meta.json` file. @@ -139,7 +136,8 @@ impl IndexReaderBuilder { /// Sets the number of warming threads. /// - /// This allows parallelizing warming work when there are multiple [Warmer] registered with the [IndexReader]. + /// This allows parallelizing warming work when there are multiple [Warmer] registered with the + /// [IndexReader]. pub fn num_warming_threads(mut self, num_warming_threads: usize) -> IndexReaderBuilder { self.num_warming_threads = num_warming_threads; self diff --git a/src/reader/pool.rs b/src/reader/pool.rs index ef70dcf8c..a6e002a1f 100644 --- a/src/reader/pool.rs +++ b/src/reader/pool.rs @@ -1,10 +1,9 @@ -use crossbeam::channel::unbounded; -use crossbeam::channel::{Receiver, RecvError, Sender}; use std::ops::{Deref, DerefMut}; -use std::sync::atomic::AtomicUsize; -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; +use crossbeam::channel::{unbounded, Receiver, RecvError, Sender}; + pub struct GenerationItem { generation: usize, item: T, @@ -196,11 +195,12 @@ impl Drop for LeasedItem { #[cfg(test)] mod tests { - use super::Pool; - use super::Queue; - use crossbeam::channel; use std::{iter, mem}; + use crossbeam::channel; + + use super::{Pool, Queue}; + #[test] fn test_pool() { let items10: Vec = iter::repeat(10).take(10).collect(); diff --git a/src/reader/warming.rs b/src/reader/warming.rs index 5e6b4e329..ad2681622 100644 --- a/src/reader/warming.rs +++ b/src/reader/warming.rs @@ -1,13 +1,10 @@ -use std::{ - collections::HashSet, - ops::Deref, - sync::{Arc, Mutex, Weak}, - thread::JoinHandle, - time::Duration, -}; +use std::collections::HashSet; +use std::ops::Deref; +use std::sync::{Arc, Mutex, Weak}; +use std::thread::JoinHandle; +use std::time::Duration; -use crate::Inventory; -use crate::{Executor, Searcher, SearcherGeneration, TantivyError}; +use crate::{Executor, Inventory, Searcher, SearcherGeneration, TantivyError}; pub const GC_INTERVAL: Duration = Duration::from_secs(1); @@ -41,9 +38,11 @@ impl WarmingState { })))) } - /// Start tracking a new generation of [Searcher], and [Warmer::warm] it if there are active warmers. + /// Start tracking a new generation of [Searcher], and [Warmer::warm] it if there are active + /// warmers. /// - /// A background GC thread for [Warmer::garbage_collect] calls is uniquely created if there are active warmers. + /// A background GC thread for [Warmer::garbage_collect] calls is uniquely created if there are + /// active warmers. pub fn warm_new_searcher_generation(&self, searcher: &Searcher) -> crate::Result<()> { self.0 .lock() @@ -70,8 +69,9 @@ struct WarmingStateInner { impl WarmingStateInner { /// Start tracking provided searcher as an exemplar of a new generation. - /// If there are active warmers, warm them with the provided searcher, and kick background GC thread if it has not yet been kicked. - /// Otherwise, prune state for dropped searcher generations inline. + /// If there are active warmers, warm them with the provided searcher, and kick background GC + /// thread if it has not yet been kicked. Otherwise, prune state for dropped searcher + /// generations inline. 
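The `ReloadPolicy` and `num_warming_threads` docs reflowed above concern the reader-builder API. A hedged usage sketch; `num_warming_threads` is the builder method shown in this diff, and the other calls follow tantivy's usual reader-builder pattern, so exact signatures should be checked against the crate version in use:

```rust
use std::convert::TryInto;

use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, ReloadPolicy};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // `OnCommit` watches `meta.json` and reloads shortly after each commit;
    // `Manual` requires an explicit `reader.reload()` call.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        // Parallelizes work across registered `Warmer`s (method added in this diff).
        .num_warming_threads(1)
        .try_into()?;

    let _searcher = reader.searcher();
    Ok(())
}
```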
fn warm_new_searcher_generation( &mut self, searcher: &Searcher, @@ -102,7 +102,8 @@ impl WarmingStateInner { strong_warmers } - /// [Warmer::garbage_collect] active warmers if some searcher generation is observed to have been dropped. + /// [Warmer::garbage_collect] active warmers if some searcher generation is observed to have + /// been dropped. fn gc_maybe(&mut self) -> bool { let live_generations = self.searcher_generation_inventory.list(); let live_generation_ids: HashSet = live_generations @@ -143,7 +144,8 @@ impl WarmingStateInner { Ok(true) } - /// Every [GC_INTERVAL] attempt to GC, with panics caught and logged using [std::panic::catch_unwind]. + /// Every [GC_INTERVAL] attempt to GC, with panics caught and logged using + /// [std::panic::catch_unwind]. fn gc_loop(inner: Weak>) { for _ in crossbeam::channel::tick(GC_INTERVAL) { if let Some(inner) = inner.upgrade() { @@ -170,22 +172,15 @@ fn warming_executor(num_threads: usize) -> crate::Result { #[cfg(test)] mod tests { - use std::{ - collections::HashSet, - sync::{ - atomic::{self, AtomicUsize}, - Arc, RwLock, Weak, - }, - }; - - use crate::{ - core::searcher::SearcherGeneration, - directory::RamDirectory, - schema::{Schema, INDEXED}, - Index, IndexSettings, ReloadPolicy, Searcher, SegmentId, - }; + use std::collections::HashSet; + use std::sync::atomic::{self, AtomicUsize}; + use std::sync::{Arc, RwLock, Weak}; use super::Warmer; + use crate::core::searcher::SearcherGeneration; + use crate::directory::RamDirectory; + use crate::schema::{Schema, INDEXED}; + use crate::{Index, IndexSettings, ReloadPolicy, Searcher, SegmentId}; #[derive(Default)] struct TestWarmer { diff --git a/src/schema/bytes_options.rs b/src/schema/bytes_options.rs index 662faf839..2adbe659a 100644 --- a/src/schema/bytes_options.rs +++ b/src/schema/bytes_options.rs @@ -1,6 +1,7 @@ -use serde::{Deserialize, Serialize}; use std::ops::BitOr; +use serde::{Deserialize, Serialize}; + use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; /// Define how an a bytes field should be handled by tantivy. #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] diff --git a/src/schema/document.rs b/src/schema/document.rs index ecf9eec38..e7e41225a 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,20 +1,18 @@ +use std::collections::{HashMap, HashSet}; +use std::io::{self, Read, Write}; +use std::mem; + +use common::{BinarySerializable, VInt}; + use super::*; use crate::tokenizer::PreTokenizedString; use crate::DateTime; -use common::BinarySerializable; -use common::VInt; -use std::collections::HashMap; -use std::collections::HashSet; -use std::io::{self, Read, Write}; -use std::mem; /// Tantivy's Document is the object that can /// be indexed and then searched for. /// /// Documents are fundamentally a collection of unordered couple `(field, value)`. /// In this list, one field may appear more than once. -/// -/// /// Documents are really just a list of couple `(field, value)`. /// In this list, one field may appear more than once. @@ -78,9 +76,7 @@ impl Document { /// Adding a facet to the document. pub fn add_facet(&mut self, field: Field, path: F) - where - Facet: From, - { + where Facet: From { let facet = Facet::from(path); let value = Value::Facet(facet); self.add(FieldValue::new(field, value)); @@ -140,7 +136,7 @@ impl Document { /// /// The result of this method is not cached and is /// computed on the fly when this method is called. 
- pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&FieldValue>)> { + pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&Value>)> { let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect(); field_values.sort_by_key(|field_value| field_value.field()); @@ -154,15 +150,15 @@ impl Document { let mut grouped_field_values = vec![]; let mut current_field = first_field_value.field(); - let mut current_group = vec![first_field_value]; + let mut current_group = vec![first_field_value.value()]; for field_value in field_values_it { if field_value.field() == current_field { - current_group.push(field_value); + current_group.push(field_value.value()); } else { grouped_field_values.push(( current_field, - mem::replace(&mut current_group, vec![field_value]), + mem::replace(&mut current_group, vec![field_value.value()]), )); current_field = field_value.field(); } diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 8cbb3b020..658bed748 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -1,14 +1,14 @@ -use common::BinarySerializable; -use once_cell::sync::Lazy; -use regex::Regex; -use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use std::borrow::Borrow; -use std::borrow::Cow; +use std::borrow::{Borrow, Cow}; use std::fmt::{self, Debug, Display, Formatter}; use std::io::{self, Read, Write}; use std::str; use std::string::FromUtf8Error; +use common::BinarySerializable; +use once_cell::sync::Lazy; +use regex::Regex; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + const SLASH_BYTE: u8 = b'/'; const ESCAPE_BYTE: u8 = b'\\'; @@ -74,7 +74,7 @@ impl Facet { /// Creates a `Facet` from its binary representation. pub fn from_encoded(encoded_bytes: Vec) -> Result { // facet bytes validation. `0u8` is used a separator but that is still legal utf-8 - //Ok(Facet(String::from_utf8(encoded_bytes)?)) + // Ok(Facet(String::from_utf8(encoded_bytes)?)) String::from_utf8(encoded_bytes).map(Facet) } @@ -84,9 +84,7 @@ impl Facet { /// contains a `/` or a `\`, it should be escaped /// using an anti-slash `/`. pub fn from_text(path: &T) -> Result - where - T: ?Sized + AsRef, - { + where T: ?Sized + AsRef { #[derive(Copy, Clone)] enum State { Escaped, @@ -212,18 +210,14 @@ fn escape_slashes(s: &str) -> Cow<'_, str> { impl Serialize for Facet { fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { + where S: Serializer { serializer.serialize_str(&self.to_string()) } } impl<'de> Deserialize<'de> for Facet { fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { + where D: Deserializer<'de> { <&'de str as Deserialize<'de>>::deserialize(deserializer).map(Facet::from) } } diff --git a/src/schema/facet_options.rs b/src/schema/facet_options.rs index f5d78042c..078153b2f 100644 --- a/src/schema/facet_options.rs +++ b/src/schema/facet_options.rs @@ -1,7 +1,9 @@ -use crate::schema::flags::{IndexedFlag, SchemaFlagList, StoredFlag}; -use serde::{Deserialize, Serialize}; use std::ops::BitOr; +use serde::{Deserialize, Serialize}; + +use crate::schema::flags::{IndexedFlag, SchemaFlagList, StoredFlag}; + /// Define how a facet field should be handled by tantivy. /// /// Note that a Facet is always indexed and stored as a fastfield. 
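The `document.rs` hunk above changes `get_sorted_field_values` to return values grouped per field as `Vec<&Value>` instead of `Vec<&FieldValue>`. A small sketch of a call site after the change; the field handles here are hypothetical and created directly from ids for brevity (in real code they come from the `Schema`):

```rust
use tantivy::schema::{Field, Value};
use tantivy::Document;

fn main() {
    // Hypothetical field ids, for illustration only.
    let title = Field::from_field_id(0);
    let tags = Field::from_field_id(1);

    let mut doc = Document::default();
    doc.add_text(title, "Of Mice and Men");
    doc.add_text(tags, "classic");
    doc.add_text(tags, "novella");

    // Values now come back grouped per field as plain `&Value`s.
    for (field, values) in doc.get_sorted_field_values() {
        let texts: Vec<&str> = values.iter().filter_map(|v| v.as_text()).collect();
        println!("{:?} -> {:?}", field, texts);
    }
}
```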
diff --git a/src/schema/field.rs b/src/schema/field.rs index ee8348e3c..1b42a6b0e 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -1,7 +1,7 @@ -use common::BinarySerializable; use std::io; -use std::io::Read; -use std::io::Write; +use std::io::{Read, Write}; + +use common::BinarySerializable; /// `Field` is represented by an unsigned 32-bit integer type /// The schema holds the mapping between field names and `Field` objects. diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index c6271944a..099f0ed27 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -1,10 +1,7 @@ -use crate::schema::FacetOptions; -use crate::schema::TextOptions; -use crate::schema::{is_valid_field_name, IntOptions}; +use serde::{Deserialize, Serialize}; use crate::schema::bytes_options::BytesOptions; -use crate::schema::FieldType; -use serde::{Deserialize, Serialize}; +use crate::schema::{is_valid_field_name, FacetOptions, FieldType, IntOptions, TextOptions}; /// A `FieldEntry` represents a field and its configuration. /// `Schema` are a collection of `FieldEntry` @@ -146,13 +143,12 @@ impl FieldEntry { #[cfg(test)] mod tests { - use super::*; - use crate::{ - schema::{Schema, TextFieldIndexing, TEXT}, - Index, - }; use serde_json; + use super::*; + use crate::schema::{Schema, TextFieldIndexing, TEXT}; + use crate::Index; + #[test] #[should_panic] fn test_invalid_field_name_should_panic() { diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 3b1a87fe5..bd6c8fa9d 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -1,15 +1,12 @@ -use crate::schema::bytes_options::BytesOptions; -use crate::schema::facet_options::FacetOptions; -use crate::schema::Facet; -use crate::schema::IndexRecordOption; -use crate::schema::TextFieldIndexing; -use crate::schema::Value; -use crate::schema::{IntOptions, TextOptions}; -use crate::tokenizer::PreTokenizedString; use chrono::{FixedOffset, Utc}; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; +use crate::schema::bytes_options::BytesOptions; +use crate::schema::facet_options::FacetOptions; +use crate::schema::{Facet, IndexRecordOption, IntOptions, TextFieldIndexing, TextOptions, Value}; +use crate::tokenizer::PreTokenizedString; + /// Possible error that may occur while parsing a field value /// At this point the JSON is known to be valid. #[derive(Debug, PartialEq)] @@ -192,12 +189,13 @@ impl FieldType { JsonValue::String(ref field_text) => match *self { FieldType::Date(_) => { let dt_with_fixed_tz: chrono::DateTime = - chrono::DateTime::parse_from_rfc3339(field_text).map_err(|err| + chrono::DateTime::parse_from_rfc3339(field_text).map_err(|err| { ValueParsingError::TypeError(format!( - "Failed to parse date from JSON. Expected rfc3339 format, got {}. {:?}", + "Failed to parse date from JSON. Expected rfc3339 format, got {}. 
\ + {:?}", field_text, err )) - )?; + })?; Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc))) } FieldType::Str(_) => Ok(Value::Str(field_text.clone())), @@ -277,15 +275,13 @@ impl FieldType { #[cfg(test)] mod tests { + use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc}; + use super::FieldType; use crate::schema::field_type::ValueParsingError; - use crate::schema::TextOptions; - use crate::schema::Type; - use crate::schema::Value; - use crate::schema::{Schema, INDEXED}; + use crate::schema::{Schema, TextOptions, Type, Value, INDEXED}; use crate::tokenizer::{PreTokenizedString, Token}; use crate::{DateTime, Document}; - use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc}; #[test] fn test_deserialize_json_date() { diff --git a/src/schema/field_value.rs b/src/schema/field_value.rs index 80160afca..b3ac6fd6a 100644 --- a/src/schema/field_value.rs +++ b/src/schema/field_value.rs @@ -1,8 +1,9 @@ -use crate::schema::Field; -use crate::schema::Value; -use common::BinarySerializable; use std::io::{self, Read, Write}; +use common::BinarySerializable; + +use crate::schema::{Field, Value}; + /// `FieldValue` holds together a `Field` and its `Value`. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct FieldValue { diff --git a/src/schema/flags.rs b/src/schema/flags.rs index 106538aec..f60cab322 100644 --- a/src/schema/flags.rs +++ b/src/schema/flags.rs @@ -1,7 +1,7 @@ -use crate::schema::IntOptions; -use crate::schema::TextOptions; use std::ops::BitOr; +use crate::schema::{IntOptions, TextOptions}; + #[derive(Clone)] pub struct StoredFlag; /// Flag to mark the field as stored. diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index e3a8dfbe3..8648b1117 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -13,7 +13,6 @@ use serde::{Deserialize, Serialize}; /// amount of information to be decoded as one goes through a posting list. /// (See [`InvertedIndexReader.read_postings`]( /// ../struct.InvertedIndexReader.html#method.read_postings)) -/// #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)] pub enum IndexRecordOption { /// records only the `DocId`s diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs index fcd5ff7dc..d13cb5d4d 100644 --- a/src/schema/int_options.rs +++ b/src/schema/int_options.rs @@ -1,7 +1,9 @@ -use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; -use serde::{Deserialize, Serialize}; use std::ops::BitOr; +use serde::{Deserialize, Serialize}; + +use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; + /// Express whether a field is single-value or multi-valued. #[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)] pub enum Cardinality { diff --git a/src/schema/mod.rs b/src/schema/mod.rs index b1ea12e27..e64ccc438 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -1,106 +1,104 @@ -/*! -Schema definition for tantivy's indices. - -# Setting your schema in Tantivy - -Tantivy has a very strict schema. -The schema defines information about the fields your index contains, that is, for each field: - -* the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`) -* the type of the field (currently only `text` and `u64` are supported) -* how the field should be indexed / stored. - -This very last point is critical as it will enable / disable some of the functionality -for your index. 
- -Tantivy's schema is stored within the `meta.json` file at the root of your -directory. - - - -# Building a schema "programmatically" - - -## Setting a text field - -### Example - -``` -use tantivy::schema::*; -let mut schema_builder = Schema::builder(); -let title_options = TextOptions::default() - .set_stored() - .set_indexing_options(TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqsAndPositions)); -schema_builder.add_text_field("title", title_options); -let schema = schema_builder.build(); -``` - -We can split the problem of generating a search result page into two phases : - -* identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`) -* for each of these documents, retrieving the information required to generate - the search results page. (`doc_ids[] -> Document[]`) - -In the first phase, the ability to search for documents by the given field is determined by the -[`IndexRecordOption`](enum.IndexRecordOption.html) of our -[`TextOptions`](struct.TextOptions.html). - -The effect of each possible setting is described more in detail -[`TextIndexingOptions`](enum.TextIndexingOptions.html). - -On the other hand setting the field as stored or not determines whether the field should be returned -when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called. - - -## Setting a u64, a i64 or a f64 field - -### Example - -``` -use tantivy::schema::*; -let mut schema_builder = Schema::builder(); -let num_stars_options = IntOptions::default() - .set_stored() - .set_indexed(); -schema_builder.add_u64_field("num_stars", num_stars_options); -let schema = schema_builder.build(); -``` - -Just like for Text fields (see above), -setting the field as stored defines whether the field will be -returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called, -and setting the field as indexed means that we will be able perform queries such as `num_stars:10`. -Note that unlike text fields, u64 can only be indexed in one way for the moment. -This may change when we will start supporting range queries. - -The `fast` option on the other hand is specific to u64 fields, and is only relevant -if you are implementing your own queries. This functionality is somewhat similar to Lucene's -`DocValues`. - -u64 that are indexed as fast will be stored in a special data structure that will -make it possible to access the u64 value given the doc id rapidly. This is useful if the value of -the field is required during scoring or collection for instance. - - -### Shortcuts - - -For convenience, it is possible to define your field indexing options by combining different flags -using the `|` operator. - -For instance, a schema containing the two fields defined in the example above could be rewritten : - -``` -use tantivy::schema::*; -let mut schema_builder = Schema::builder(); -schema_builder.add_u64_field("num_stars", INDEXED | STORED); -schema_builder.add_text_field("title", TEXT | STORED); -let schema = schema_builder.build(); -``` - -*/ +//! Schema definition for tantivy's indices. +//! +//! # Setting your schema in Tantivy +//! +//! Tantivy has a very strict schema. +//! The schema defines information about the fields your index contains, that is, for each field: +//! +//! the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`) +//! the type of the field (currently only `text` and `u64` are supported) +//! how the field should be indexed / stored. +//! +//! 
This very last point is critical as it will enable / disable some of the functionality +//! for your index. +//! +//! Tantivy's schema is stored within the `meta.json` file at the root of your +//! directory. +//! +//! +//! +//! # Building a schema "programmatically" +//! +//! +//! ## Setting a text field +//! +//! ### Example +//! +//! ``` +//! use tantivy::schema::*; +//! let mut schema_builder = Schema::builder(); +//! let title_options = TextOptions::default() +//! .set_stored() +//! .set_indexing_options(TextFieldIndexing::default() +//! .set_tokenizer("default") +//! .set_index_option(IndexRecordOption::WithFreqsAndPositions)); +//! schema_builder.add_text_field("title", title_options); +//! let schema = schema_builder.build(); +//! ``` +//! +//! We can split the problem of generating a search result page into two phases : +//! +//! identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`) +//! for each of these documents, retrieving the information required to generate +//! the search results page. (`doc_ids[] -> Document[]`) +//! +//! In the first phase, the ability to search for documents by the given field is determined by the +//! [`IndexRecordOption`](enum.IndexRecordOption.html) of our +//! [`TextOptions`](struct.TextOptions.html). +//! +//! The effect of each possible setting is described more in detail +//! [`TextIndexingOptions`](enum.TextIndexingOptions.html). +//! +//! On the other hand setting the field as stored or not determines whether the field should be +//! returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called. +//! +//! +//! ## Setting a u64, a i64 or a f64 field +//! +//! ### Example +//! +//! ``` +//! use tantivy::schema::*; +//! let mut schema_builder = Schema::builder(); +//! let num_stars_options = IntOptions::default() +//! .set_stored() +//! .set_indexed(); +//! schema_builder.add_u64_field("num_stars", num_stars_options); +//! let schema = schema_builder.build(); +//! ``` +//! +//! Just like for Text fields (see above), +//! setting the field as stored defines whether the field will be +//! returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called, +//! and setting the field as indexed means that we will be able perform queries such as +//! `num_stars:10`. Note that unlike text fields, u64 can only be indexed in one way for the moment. +//! This may change when we will start supporting range queries. +//! +//! The `fast` option on the other hand is specific to u64 fields, and is only relevant +//! if you are implementing your own queries. This functionality is somewhat similar to Lucene's +//! `DocValues`. +//! +//! u64 that are indexed as fast will be stored in a special data structure that will +//! make it possible to access the u64 value given the doc id rapidly. This is useful if the value +//! of the field is required during scoring or collection for instance. +//! +//! +//! ### Shortcuts +//! +//! +//! For convenience, it is possible to define your field indexing options by combining different +//! flags using the `|` operator. +//! +//! For instance, a schema containing the two fields defined in the example above could be rewritten +//! : +//! +//! ``` +//! use tantivy::schema::*; +//! let mut schema_builder = Schema::builder(); +//! schema_builder.add_u64_field("num_stars", INDEXED | STORED); +//! schema_builder.add_text_field("title", TEXT | STORED); +//! let schema = schema_builder.build(); +//! 
``` mod document; mod facet; @@ -122,34 +120,23 @@ mod value; mod flags; -pub use self::named_field_document::NamedFieldDocument; -pub use self::schema::DocParsingError; -pub use self::schema::{Schema, SchemaBuilder}; -pub use self::value::Value; - -pub use self::facet::Facet; -pub use self::facet::FacetParseError; -pub(crate) use self::facet::FACET_SEP_BYTE; -pub use self::facet_options::FacetOptions; - +pub use self::bytes_options::BytesOptions; pub use self::document::Document; +pub(crate) use self::facet::FACET_SEP_BYTE; +pub use self::facet::{Facet, FacetParseError}; +pub use self::facet_options::FacetOptions; pub use self::field::Field; -pub use self::term::Term; - pub use self::field_entry::FieldEntry; pub use self::field_type::{FieldType, Type}; pub use self::field_value::FieldValue; - -pub use self::index_record_option::IndexRecordOption; -pub use self::text_options::TextFieldIndexing; -pub use self::text_options::TextOptions; -pub use self::text_options::STRING; -pub use self::text_options::TEXT; - -pub use self::bytes_options::BytesOptions; pub use self::flags::{FAST, INDEXED, STORED}; -pub use self::int_options::Cardinality; -pub use self::int_options::IntOptions; +pub use self::index_record_option::IndexRecordOption; +pub use self::int_options::{Cardinality, IntOptions}; +pub use self::named_field_document::NamedFieldDocument; +pub use self::schema::{DocParsingError, Schema, SchemaBuilder}; +pub use self::term::Term; +pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT}; +pub use self::value::Value; /// Validator for a potential `field_name`. /// Returns true if the name can be use for a field name. diff --git a/src/schema/named_field_document.rs b/src/schema/named_field_document.rs index 3a69df632..9f7d09fae 100644 --- a/src/schema/named_field_document.rs +++ b/src/schema/named_field_document.rs @@ -1,12 +1,13 @@ -use crate::schema::Value; -use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; +use serde::{Deserialize, Serialize}; + +use crate::schema::Value; + /// Internal representation of a document used for JSON /// serialization. /// /// A `NamedFieldDocument` is a simple representation of a document /// as a `BTreeMap>`. -/// #[derive(Debug, Deserialize, Serialize)] pub struct NamedFieldDocument(pub BTreeMap>); diff --git a/src/schema/schema.rs b/src/schema/schema.rs index c6e35e2b4..decc6ae00 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -1,15 +1,15 @@ -use crate::schema::field_type::ValueParsingError; -use std::collections::BTreeMap; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; +use std::fmt; use std::sync::Arc; -use super::*; -use crate::schema::bytes_options::BytesOptions; use serde::de::{SeqAccess, Visitor}; use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_json::{self, Map as JsonObject, Value as JsonValue}; -use std::fmt; + +use super::*; +use crate::schema::bytes_options::BytesOptions; +use crate::schema::field_type::ValueParsingError; /// Tantivy has a very strict schema. 
/// You need to specify in advance whether a field is indexed or not, @@ -29,7 +29,6 @@ use std::fmt; /// let title_field = schema_builder.add_text_field("title", TEXT); /// let body_field = schema_builder.add_text_field("body", TEXT); /// let schema = schema_builder.build(); -/// /// ``` #[derive(Default)] pub struct SchemaBuilder { @@ -224,7 +223,6 @@ impl Eq for InnerSchema {} /// let title_field = schema_builder.add_text_field("title", TEXT); /// let body_field = schema_builder.add_text_field("body", TEXT); /// let schema = schema_builder.build(); -/// /// ``` #[derive(Clone, Eq, PartialEq, Debug)] pub struct Schema(Arc); @@ -286,11 +284,7 @@ impl Schema { let mut field_map = BTreeMap::new(); for (field, field_values) in doc.get_sorted_field_values() { let field_name = self.get_field_name(field); - let values: Vec = field_values - .into_iter() - .map(FieldValue::value) - .cloned() - .collect(); + let values: Vec = field_values.into_iter().cloned().collect(); field_map.insert(field_name.to_string(), values); } NamedFieldDocument(field_map) @@ -344,9 +338,7 @@ impl Schema { impl Serialize for Schema { fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { + where S: Serializer { let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?; for e in &self.0.fields { seq.serialize_element(e)?; @@ -357,9 +349,7 @@ impl Serialize for Schema { impl<'de> Deserialize<'de> for Schema { fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { + where D: Deserializer<'de> { struct SchemaVisitor; impl<'de> Visitor<'de> for SchemaVisitor { @@ -370,9 +360,7 @@ impl<'de> Deserialize<'de> for Schema { } fn visit_seq(self, mut seq: A) -> Result - where - A: SeqAccess<'de>, - { + where A: SeqAccess<'de> { let mut schema = SchemaBuilder { fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)), fields_map: HashMap::with_capacity(seq.size_hint().unwrap_or(0)), @@ -405,13 +393,15 @@ pub enum DocParsingError { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + + use matches::{assert_matches, matches}; + use serde_json; + use crate::schema::field_type::ValueParsingError; use crate::schema::int_options::Cardinality::SingleValue; use crate::schema::schema::DocParsingError::NotJson; use crate::schema::*; - use matches::{assert_matches, matches}; - use serde_json; - use std::collections::BTreeMap; #[test] pub fn is_indexed_test() { @@ -636,20 +626,17 @@ mod tests { }"#, ) .unwrap(); - assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title")); assert_eq!( - doc.get_first(author_field).unwrap().text(), + doc.get_first(title_field).unwrap().as_text(), + Some("my title") + ); + assert_eq!( + doc.get_first(author_field).unwrap().as_text(), Some("fulmicoton") ); - assert_eq!(doc.get_first(count_field).unwrap().u64_value(), Some(4)); - assert_eq!( - doc.get_first(popularity_field).unwrap().i64_value(), - Some(10) - ); - assert_eq!( - doc.get_first(score_field).unwrap().f64_value(), - Some(80.5f64) - ); + assert_eq!(doc.get_first(count_field).unwrap().as_u64(), Some(4)); + assert_eq!(doc.get_first(popularity_field).unwrap().as_i64(), Some(10)); + assert_eq!(doc.get_first(score_field).unwrap().as_f64(), Some(80.5f64)); } { let res = schema.parse_document( diff --git a/src/schema/term.rs b/src/schema/term.rs index 9a05ad4c4..c6e3cfbb0 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,11 +1,10 @@ -use std::fmt; use std::hash::{Hash, Hasher}; +use std::{fmt, str}; use super::Field; use crate::fastfield::FastValue; use crate::schema::{Facet, 
Type}; use crate::DateTime; -use std::str; /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term. /// + + @@ -16,8 +15,7 @@ const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8; /// It actually wraps a `Vec`. #[derive(Clone)] pub struct Term>(B) -where - B: AsRef<[u8]>; +where B: AsRef<[u8]>; impl Term { pub(crate) fn new() -> Term { @@ -125,8 +123,7 @@ impl Term { } impl Ord for Term -where - B: AsRef<[u8]>, +where B: AsRef<[u8]> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.as_slice().cmp(other.as_slice()) @@ -134,8 +131,7 @@ where } impl PartialOrd for Term -where - B: AsRef<[u8]>, +where B: AsRef<[u8]> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -143,8 +139,7 @@ where } impl PartialEq for Term -where - B: AsRef<[u8]>, +where B: AsRef<[u8]> { fn eq(&self, other: &Self) -> bool { self.as_slice() == other.as_slice() @@ -154,8 +149,7 @@ where impl Eq for Term where B: AsRef<[u8]> {} impl Hash for Term -where - B: AsRef<[u8]>, +where B: AsRef<[u8]> { fn hash(&self, state: &mut H) { self.0.as_ref().hash(state) @@ -163,8 +157,7 @@ where } impl Term -where - B: AsRef<[u8]>, +where B: AsRef<[u8]> { /// Wraps a object holding bytes pub fn wrap(data: B) -> Term { diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 3eb0b340b..4c1810870 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -1,10 +1,11 @@ -use crate::schema::flags::SchemaFlagList; -use crate::schema::flags::StoredFlag; -use crate::schema::IndexRecordOption; -use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::ops::BitOr; +use serde::{Deserialize, Serialize}; + +use crate::schema::flags::{SchemaFlagList, StoredFlag}; +use crate::schema::IndexRecordOption; + /// Define how a text field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)] pub struct TextOptions { @@ -40,7 +41,8 @@ impl TextOptions { /// /// It defines /// - the amount of information that should be stored about the presence of a term in a document. -/// Essentially, should we store the term frequency and/or the positions (See [`IndexRecordOption`](./enum.IndexRecordOption.html)). +/// Essentially, should we store the term frequency and/or the positions (See +/// [`IndexRecordOption`](./enum.IndexRecordOption.html)). /// - the name of the `Tokenizer` that should be used to process the field. #[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] pub struct TextFieldIndexing { diff --git a/src/schema/value.rs b/src/schema/value.rs index 426643d79..df83930d6 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -1,9 +1,11 @@ +use std::fmt; + +use serde::de::Visitor; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + use crate::schema::Facet; use crate::tokenizer::PreTokenizedString; use crate::DateTime; -use serde::de::Visitor; -use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use std::fmt; /// Value represents the value of a any field. /// It is an enum over all over all of the possible field type. 
@@ -31,9 +33,7 @@ impl Eq for Value {} impl Serialize for Value { fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { + where S: Serializer { match *self { Value::Str(ref v) => serializer.serialize_str(v), Value::PreTokStr(ref v) => v.serialize(serializer), @@ -49,9 +49,7 @@ impl Serialize for Value { impl<'de> Deserialize<'de> for Value { fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { + where D: Deserializer<'de> { struct ValueVisitor; impl<'de> Visitor<'de> for ValueVisitor { @@ -89,7 +87,7 @@ impl<'de> Deserialize<'de> for Value { impl Value { /// Returns the text value, provided the value is of the `Str` type. /// (Returns None if the value is not of the `Str` type). - pub fn text(&self) -> Option<&str> { + pub fn as_text(&self) -> Option<&str> { if let Value::Str(text) = self { Some(text) } else { @@ -99,7 +97,7 @@ impl Value { /// Returns the facet value, provided the value is of the `Facet` type. /// (Returns None if the value is not of the `Facet` type). - pub fn facet(&self) -> Option<&Facet> { + pub fn as_facet(&self) -> Option<&Facet> { if let Value::Facet(facet) = self { Some(facet) } else { @@ -119,7 +117,7 @@ impl Value { /// Returns the u64-value, provided the value is of the `U64` type. /// (Returns None if the value is not of the `U64` type) - pub fn u64_value(&self) -> Option { + pub fn as_u64(&self) -> Option { if let Value::U64(val) = self { Some(*val) } else { @@ -130,7 +128,7 @@ impl Value { /// Returns the i64-value, provided the value is of the `I64` type. /// /// Return None if the value is not of type `I64`. - pub fn i64_value(&self) -> Option { + pub fn as_i64(&self) -> Option { if let Value::I64(val) = self { Some(*val) } else { @@ -141,7 +139,7 @@ impl Value { /// Returns the f64-value, provided the value is of the `F64` type. /// /// Return None if the value is not of type `F64`. - pub fn f64_value(&self) -> Option { + pub fn as_f64(&self) -> Option { if let Value::F64(value) = self { Some(*value) } else { @@ -152,7 +150,7 @@ impl Value { /// Returns the Date-value, provided the value is of the `Date` type. /// /// Returns None if the value is not of type `Date`. - pub fn date_value(&self) -> Option<&DateTime> { + pub fn as_date(&self) -> Option<&DateTime> { if let Value::Date(date) = self { Some(date) } else { @@ -163,7 +161,7 @@ impl Value { /// Returns the Bytes-value, provided the value is of the `Bytes` type. /// /// Returns None if the value is not of type `Bytes`. 
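The renames in this hunk (`text()` → `as_text()`, `u64_value()` → `as_u64()`, and so on, with `bytes_value()` → `as_bytes()` just below) only change call sites, not behavior. A quick sketch of the new accessor names on a couple of `Value`s:

```rust
use tantivy::schema::Value;

fn main() {
    let title = Value::Str("The Old Man and the Sea".to_string());
    let count = Value::U64(42);

    // New names: `as_text`, `as_u64`, `as_i64`, `as_f64`, `as_date`, `as_bytes`, `as_facet`.
    assert_eq!(title.as_text(), Some("The Old Man and the Sea"));
    assert_eq!(title.as_u64(), None);
    assert_eq!(count.as_u64(), Some(42));
    assert_eq!(count.as_text(), None);
}
```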
- pub fn bytes_value(&self) -> Option<&[u8]> { + pub fn as_bytes(&self) -> Option<&[u8]> { if let Value::Bytes(bytes) = self { Some(bytes) } else { @@ -233,12 +231,14 @@ impl From for Value { } mod binary_serialize { + use std::io::{self, Read, Write}; + + use chrono::{TimeZone, Utc}; + use common::{f64_to_u64, u64_to_f64, BinarySerializable}; + use super::Value; use crate::schema::Facet; use crate::tokenizer::PreTokenizedString; - use chrono::{TimeZone, Utc}; - use common::{f64_to_u64, u64_to_f64, BinarySerializable}; - use std::io::{self, Read, Write}; const TEXT_CODE: u8 = 0; const U64_CODE: u8 = 1; @@ -358,9 +358,10 @@ mod binary_serialize { #[cfg(test)] mod tests { + use std::str::FromStr; + use super::Value; use crate::DateTime; - use std::str::FromStr; #[test] fn test_serialize_date() { diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 2dc98ab39..c67df021d 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,14 +1,14 @@ -use crate::query::Query; -use crate::schema::Field; -use crate::schema::Value; -use crate::tokenizer::{TextAnalyzer, Token}; -use crate::Searcher; -use crate::{Document, Score}; -use htmlescape::encode_minimal; use std::cmp::Ordering; use std::collections::BTreeMap; use std::ops::Range; +use htmlescape::encode_minimal; + +use crate::query::Query; +use crate::schema::{Field, Value}; +use crate::tokenizer::{TextAnalyzer, Token}; +use crate::{Document, Score, Searcher}; + const DEFAULT_MAX_NUM_CHARS: usize = 150; #[derive(Debug)] @@ -280,7 +280,7 @@ impl SnippetGenerator { pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { let text: String = doc .get_all(self.field) - .flat_map(Value::text) + .flat_map(Value::as_text) .collect::>() .join(" "); self.snippet(&text) @@ -296,14 +296,15 @@ impl SnippetGenerator { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + + use maplit::btreemap; + use super::{search_fragments, select_best_fragment_combination}; use crate::query::QueryParser; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; use crate::tokenizer::SimpleTokenizer; - use crate::Index; - use crate::SnippetGenerator; - use maplit::btreemap; - use std::collections::BTreeMap; + use crate::{Index, SnippetGenerator}; const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and @@ -335,13 +336,13 @@ Survey in 2016, 2017, and 2018."#; let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT); assert_eq!( snippet.fragment, - "Rust is a systems programming language sponsored by\n\ - Mozilla which describes it as a \"safe" + "Rust is a systems programming language sponsored by\nMozilla which describes it as a \ + \"safe" ); assert_eq!( snippet.to_html(), - "Rust is a systems programming language \ - sponsored by\nMozilla which describes it as a "safe" + "Rust is a systems programming language sponsored by\nMozilla which \ + describes it as a "safe" ) } @@ -367,7 +368,7 @@ Survey in 2016, 2017, and 2018."#; String::from("language") => 1.0 }; let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20); - //assert_eq!(fragments.len(), 7); + // assert_eq!(fragments.len(), 7); { let first = &fragments[0]; assert_eq!(first.score, 0.9); @@ -547,12 +548,21 @@ Survey in 2016, 2017, and 2018."#; SnippetGenerator::create(&searcher, &*query, text_field).unwrap(); { let snippet = snippet_generator.snippet(TEST_TEXT); - assert_eq!(snippet.to_html(), 
"imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to provide better memory safety"); + assert_eq!( + snippet.to_html(), + "imperative-procedural paradigms. Rust is syntactically similar to \ + C++[according to whom?],\nbut its designers intend it to provide better \ + memory safety" + ); } { snippet_generator.set_max_num_chars(90); let snippet = snippet_generator.snippet(TEST_TEXT); - assert_eq!(snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to"); + assert_eq!( + snippet.to_html(), + "Rust is syntactically similar to C++[according to whom?],\nbut its \ + designers intend it to" + ); } Ok(()) } diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 7bf2fc84d..d7262b2d1 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -1,18 +1,18 @@ -/*! -Representations for the space usage of various parts of a Tantivy index. +//! Representations for the space usage of various parts of a Tantivy index. +//! +//! This can be used programmatically, and will also be exposed in a human readable fashion in +//! tantivy-cli. +//! +//! One important caveat for all of this functionality is that none of it currently takes +//! storage-level details into consideration. For example, if your file system block size is 4096 +//! bytes, we can under-count actual resultant space usage by up to 4095 bytes per file. -This can be used programmatically, and will also be exposed in a human readable fashion in -tantivy-cli. +use std::collections::HashMap; -One important caveat for all of this functionality is that none of it currently takes storage-level -details into consideration. For example, if your file system block size is 4096 bytes, we can -under-count actual resultant space usage by up to 4095 bytes per file. 
-*/ +use serde::{Deserialize, Serialize}; use crate::schema::Field; use crate::SegmentComponent; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; /// Indicates space usage in bytes pub type ByteCount = usize; @@ -286,11 +286,8 @@ impl FieldUsage { #[cfg(test)] mod test { use crate::core::Index; - use crate::schema::Field; - use crate::schema::Schema; - use crate::schema::{FAST, INDEXED, STORED, TEXT}; - use crate::space_usage::ByteCount; - use crate::space_usage::PerFieldSpaceUsage; + use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT}; + use crate::space_usage::{ByteCount, PerFieldSpaceUsage}; use crate::Term; #[test] diff --git a/src/store/compression_lz4_block.rs b/src/store/compression_lz4_block.rs index 532eed2fa..0464510b8 100644 --- a/src/store/compression_lz4_block.rs +++ b/src/store/compression_lz4_block.rs @@ -1,9 +1,9 @@ -use std::io::{self}; - use core::convert::TryInto; -use lz4_flex::{compress_into, decompress_into}; +use std::io::{self}; use std::mem; +use lz4_flex::{compress_into, decompress_into}; + #[inline] #[allow(clippy::uninit_vec)] pub fn compress(uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { diff --git a/src/store/compressors.rs b/src/store/compressors.rs index 3c052ac9d..f386d286e 100644 --- a/src/store/compressors.rs +++ b/src/store/compressors.rs @@ -1,6 +1,7 @@ -use serde::{Deserialize, Serialize}; use std::io; +use serde::{Deserialize, Serialize}; + pub trait StoreCompressor { fn compress(&self, uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()>; fn decompress(&self, compressed: &[u8], decompressed: &mut Vec) -> io::Result<()>; diff --git a/src/store/footer.rs b/src/store/footer.rs index 6f63f8170..102fd675b 100644 --- a/src/store/footer.rs +++ b/src/store/footer.rs @@ -1,7 +1,10 @@ -use crate::{directory::FileSlice, store::Compressor}; -use common::{BinarySerializable, FixedSize, HasLen}; use std::io; +use common::{BinarySerializable, FixedSize, HasLen}; + +use crate::directory::FileSlice; +use crate::store::Compressor; + #[derive(Debug, Clone, PartialEq)] pub struct DocStoreFooter { pub offset: u64, @@ -10,7 +13,7 @@ pub struct DocStoreFooter { /// Serialises the footer to a byte-array /// - offset : 8 bytes -///- compressor id: 1 byte +/// - compressor id: 1 byte /// - reserved for future use: 15 bytes impl BinarySerializable for DocStoreFooter { fn serialize(&self, writer: &mut W) -> io::Result<()> { @@ -61,6 +64,7 @@ impl DocStoreFooter { #[test] fn doc_store_footer_test() { // This test is just to safe guard changes on the footer. - // When the doc store footer is updated, make sure to update also the serialize/deserialize methods + // When the doc store footer is updated, make sure to update also the serialize/deserialize + // methods assert_eq!(core::mem::size_of::(), 16); } diff --git a/src/store/index/block.rs b/src/store/index/block.rs index 5915f1e13..991ef998a 100644 --- a/src/store/index/block.rs +++ b/src/store/index/block.rs @@ -1,9 +1,11 @@ -use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; -use crate::DocId; -use common::VInt; use std::io; use std::ops::Range; +use common::VInt; + +use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; +use crate::DocId; + /// Represents a block of checkpoints. 
/// /// The DocStore index checkpoints are organized into block @@ -105,10 +107,11 @@ impl CheckpointBlock { #[cfg(test)] mod tests { + use std::io; + use crate::store::index::block::CheckpointBlock; use crate::store::index::Checkpoint; use crate::DocId; - use std::io; fn test_aux_ser_deser(checkpoints: &[Checkpoint]) -> io::Result<()> { let mut block = CheckpointBlock::default(); diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs index 6535ff571..7d8b8606a 100644 --- a/src/store/index/mod.rs +++ b/src/store/index/mod.rs @@ -6,10 +6,9 @@ mod block; mod skip_index; mod skip_index_builder; -use crate::DocId; - pub use self::skip_index::SkipIndex; pub use self::skip_index_builder::SkipIndexBuilder; +use crate::DocId; /// A checkpoint contains meta-information about /// a block. Either a block of documents, or another block @@ -45,14 +44,13 @@ mod tests { use futures::executor::block_on; use proptest::strategy::{BoxedStrategy, Strategy}; + use super::{SkipIndex, SkipIndexBuilder}; use crate::directory::OwnedBytes; use crate::indexer::NoMergePolicy; use crate::schema::{SchemaBuilder, STORED, TEXT}; use crate::store::index::Checkpoint; use crate::{DocAddress, DocId, Index, Term}; - use super::{SkipIndex, SkipIndexBuilder}; - #[test] fn test_skip_index_empty() -> io::Result<()> { let mut output: Vec = Vec::new(); diff --git a/src/store/index/skip_index.rs b/src/store/index/skip_index.rs index 4eb3e3b32..5eafb2814 100644 --- a/src/store/index/skip_index.rs +++ b/src/store/index/skip_index.rs @@ -1,8 +1,9 @@ +use common::{BinarySerializable, VInt}; + use crate::directory::OwnedBytes; use crate::store::index::block::CheckpointBlock; use crate::store::index::Checkpoint; use crate::DocId; -use common::{BinarySerializable, VInt}; pub struct LayerCursor<'a> { remaining: &'a [u8], diff --git a/src/store/index/skip_index_builder.rs b/src/store/index/skip_index_builder.rs index c9e311b92..cbb899a21 100644 --- a/src/store/index/skip_index_builder.rs +++ b/src/store/index/skip_index_builder.rs @@ -1,9 +1,11 @@ -use crate::store::index::block::CheckpointBlock; -use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; -use common::{BinarySerializable, VInt}; use std::io; use std::io::Write; +use common::{BinarySerializable, VInt}; + +use crate::store::index::block::CheckpointBlock; +use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; + // Each skip contains iterator over pairs (last doc in block, offset to start of block). struct LayerBuilder { diff --git a/src/store/mod.rs b/src/store/mod.rs index 19c2aba2a..6ea7c4795 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -1,37 +1,36 @@ -/*! -Compressed/slow/row-oriented storage for documents. - -A field needs to be marked as stored in the schema in -order to be handled in the `Store`. - -Internally, documents (or rather their stored fields) are serialized to a buffer. -When the buffer exceeds 16K, the buffer is compressed using `brotli`, `LZ4` or `snappy` -and the resulting block is written to disk. - -One can then request for a specific `DocId`. -A skip list helps navigating to the right block, -decompresses it entirely and returns the document within it. - -If the last document requested was in the same block, -the reader is smart enough to avoid decompressing -the block a second time, but their is no real -*uncompressed block* cache. - -A typical use case for the store is, once -the search result page has been computed, returning -the actual content of the 10 best document. 
- -# Usage - -Most users should not access the `StoreReader` directly -and should rely on either - -- at the segment level, the -[`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc) -- at the index level, the -[`Searcher`'s `doc` method](../struct.Searcher.html#method.doc) - -!*/ +//! Compressed/slow/row-oriented storage for documents. +//! +//! A field needs to be marked as stored in the schema in +//! order to be handled in the `Store`. +//! +//! Internally, documents (or rather their stored fields) are serialized to a buffer. +//! When the buffer exceeds 16K, the buffer is compressed using `brotli`, `LZ4` or `snappy` +//! and the resulting block is written to disk. +//! +//! One can then request for a specific `DocId`. +//! A skip list helps navigating to the right block, +//! decompresses it entirely and returns the document within it. +//! +//! If the last document requested was in the same block, +//! the reader is smart enough to avoid decompressing +//! the block a second time, but their is no real +//! uncompressed block* cache. +//! +//! A typical use case for the store is, once +//! the search result page has been computed, returning +//! the actual content of the 10 best document. +//! +//! # Usage +//! +//! Most users should not access the `StoreReader` directly +//! and should rely on either +//! +//! - at the segment level, the +//! [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc) +//! - at the index level, the +//! [`Searcher`'s `doc` method](../struct.Searcher.html#method.doc) +//! +//! ! mod compressors; mod footer; @@ -54,27 +53,25 @@ mod compression_snap; #[cfg(test)] pub mod tests { + use std::path::Path; + use futures::executor::block_on; use super::*; + use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::fastfield::AliveBitSet; - use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT}; - use crate::schema::{Document, TextOptions}; - use crate::{ - directory::{Directory, RamDirectory, WritePtr}, - Term, + use crate::schema::{ + self, Document, FieldValue, Schema, TextFieldIndexing, TextOptions, STORED, TEXT, }; - use crate::{schema::Schema, Index}; - use std::path::Path; + use crate::{Index, Term}; - const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ - do eiusmod tempor incididunt ut labore et dolore magna aliqua. \ - Ut enim ad minim veniam, quis nostrud exercitation ullamco \ - laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \ - dolor in reprehenderit in voluptate velit esse cillum dolore eu \ - fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \ - proident, sunt in culpa qui officia deserunt mollit anim id est \ - laborum."; + const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \ + eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad \ + minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip \ + ex ea commodo consequat. Duis aute irure dolor in reprehenderit in \ + voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur \ + sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt \ + mollit anim id est laborum."; pub fn write_lorem_ipsum_store( writer: WritePtr, @@ -99,7 +96,7 @@ pub mod tests { let field_value = FieldValue::new(field_title, From::from(title_text)); fields.push(field_value); } - //let fields_refs: Vec<&FieldValue> = fields.iter().collect(); + // let fields_refs: Vec<&FieldValue> = fields.iter().collect(); let doc = Document::from(fields); store_writer.store(&doc).unwrap(); } @@ -129,7 +126,7 @@ pub mod tests { .get(i)? .get_first(field_title) .unwrap() - .text() + .as_text() .unwrap(), format!("Doc {}", i) ); @@ -137,7 +134,7 @@ pub mod tests { for (_, doc) in store.iter(Some(&alive_bitset)).enumerate() { let doc = doc?; - let title_content = doc.get_first(field_title).unwrap().text().unwrap(); + let title_content = doc.get_first(field_title).unwrap().as_text().unwrap(); if !title_content.starts_with("Doc ") { panic!("unexpected title_content {}", title_content); } @@ -169,14 +166,14 @@ pub mod tests { .get(i)? .get_first(field_title) .unwrap() - .text() + .as_text() .unwrap(), format!("Doc {}", i) ); } for (i, doc) in store.iter(None).enumerate() { assert_eq!( - *doc?.get_first(field_title).unwrap().text().unwrap(), + *doc?.get_first(field_title).unwrap().as_text().unwrap(), format!("Doc {}", i) ); } @@ -236,7 +233,7 @@ pub mod tests { let store = reader.get_store_reader()?; for doc in store.iter(reader.alive_bitset()) { assert_eq!( - *doc?.get_first(text_field).unwrap().text().unwrap(), + *doc?.get_first(text_field).unwrap().as_text().unwrap(), "deletemenot".to_string() ); } @@ -346,14 +343,14 @@ pub mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::tests::write_lorem_ipsum_store; - use crate::directory::Directory; - use crate::directory::RamDirectory; - use crate::store::Compressor; - use crate::store::StoreReader; use std::path::Path; + use test::Bencher; + use super::tests::write_lorem_ipsum_store; + use crate::directory::{Directory, RamDirectory}; + use crate::store::{Compressor, StoreReader}; + #[bench] #[cfg(feature = "mmap")] fn bench_store_encode(b: &mut Bencher) { diff --git a/src/store/reader.rs b/src/store/reader.rs index c27c88d15..9ad6e307e 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -1,16 +1,20 @@ +use std::io; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; + +use common::{BinarySerializable, HasLen, VInt}; +use lru::LruCache; + +use super::footer::DocStoreFooter; +use super::index::SkipIndex; use super::Compressor; -use super::{footer::DocStoreFooter, index::SkipIndex}; use crate::directory::{FileSlice, OwnedBytes}; +use crate::error::DataCorruption; +use crate::fastfield::AliveBitSet; use crate::schema::Document; use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; -use crate::{error::DataCorruption, fastfield::AliveBitSet}; -use common::{BinarySerializable, HasLen, VInt}; -use lru::LruCache; -use std::io; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::{Arc, Mutex}; const LRU_CACHE_CAPACITY: usize = 100; @@ -105,14 +109,13 @@ impl StoreReader { Ok(Document::deserialize(&mut doc_bytes)?) } - /// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end - /// position within the block. + /// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a + /// document and its start and end position within the block. 
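The `StoreReader` doc touched here describes the access pattern: a document lookup locates the containing compressed block, decompresses it, and slices the document out, with a cache making repeated lookups in the same block cheap. A toy model of that behaviour, illustrative only (tantivy uses skip-index checkpoints and an LRU cache, not a HashMap keyed by block id):

```rust
use std::collections::HashMap;

struct ToyStoreReader {
    blocks: Vec<Vec<u8>>, // stand-in for compressed blocks on disk
    docs_per_block: usize,
    cache: HashMap<usize, Vec<u8>>, // block id -> "decompressed" bytes
}

impl ToyStoreReader {
    fn decompress(block: &[u8]) -> Vec<u8> {
        block.to_vec() // pretend this is the expensive part
    }

    fn get(&mut self, doc_id: usize) -> u8 {
        let block_id = doc_id / self.docs_per_block;
        if !self.cache.contains_key(&block_id) {
            let decompressed = Self::decompress(&self.blocks[block_id]);
            self.cache.insert(block_id, decompressed);
        }
        self.cache[&block_id][doc_id % self.docs_per_block]
    }
}

fn main() {
    let mut reader = ToyStoreReader {
        blocks: vec![vec![10, 11, 12], vec![20, 21, 22]],
        docs_per_block: 3,
        cache: HashMap::new(),
    };
    assert_eq!(reader.get(0), 10); // decompresses block 0
    assert_eq!(reader.get(1), 11); // served from the cached block
    assert_eq!(reader.get(4), 21); // decompresses block 1
}
```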
/// /// Calling `.get(doc)` is relatively costly as it requires /// decompressing a compressed block. The store utilizes a LRU cache, /// so accessing docs from the same compressed block should be faster. /// For that reason a store reader should be kept and reused. - /// pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result { let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| { crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id)) @@ -196,7 +199,8 @@ impl StoreReader { let block = block .ok_or_else(|| { DataCorruption::comment_only( - "the current checkpoint in the doc store iterator is none, this should never happen", + "the current checkpoint in the doc store iterator is none, this \ + should never happen", ) })? .map_err(|error_kind| { @@ -237,14 +241,16 @@ impl StoreReader { #[cfg(test)] mod tests { - use super::*; - use crate::schema::Document; - use crate::schema::Field; - use crate::{directory::RamDirectory, store::tests::write_lorem_ipsum_store, Directory}; use std::path::Path; + use super::*; + use crate::directory::RamDirectory; + use crate::schema::{Document, Field}; + use crate::store::tests::write_lorem_ipsum_store; + use crate::Directory; + fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> { - doc.get_first(*field).and_then(|f| f.text()) + doc.get_first(*field).and_then(|f| f.as_text()) } #[test] diff --git a/src/store/writer.rs b/src/store/writer.rs index d7004c0f6..0efdb0fc6 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,14 +1,15 @@ +use std::io::{self, Write}; + +use common::{BinarySerializable, CountingWriter, VInt}; + +use super::compressors::Compressor; +use super::footer::DocStoreFooter; use super::index::SkipIndexBuilder; use super::StoreReader; -use super::{compressors::Compressor, footer::DocStoreFooter}; -use crate::directory::TerminatingWrite; -use crate::directory::WritePtr; +use crate::directory::{TerminatingWrite, WritePtr}; use crate::schema::Document; use crate::store::index::Checkpoint; use crate::DocId; -use common::CountingWriter; -use common::{BinarySerializable, VInt}; -use std::io::{self, Write}; const BLOCK_SIZE: usize = 16_384; @@ -19,7 +20,6 @@ const BLOCK_SIZE: usize = 16_384; /// as opposed to when the segment is getting finalized. /// /// The skip list index on the other hand, is built in memory. -/// pub struct StoreWriter { compressor: Compressor, doc: DocId, @@ -60,7 +60,6 @@ impl StoreWriter { /// /// The document id is implicitely the current number /// of documents. - /// pub fn store_bytes(&mut self, serialized_document: &[u8]) -> io::Result<()> { let doc_num_bytes = serialized_document.len(); VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; @@ -76,7 +75,6 @@ impl StoreWriter { /// /// The document id is implicitely the current number /// of documents. - /// pub fn store(&mut self, stored_document: &Document) -> io::Result<()> { self.intermediary_buffer.clear(); stored_document.serialize(&mut self.intermediary_buffer)?; diff --git a/src/termdict/fst_termdict/mod.rs b/src/termdict/fst_termdict/mod.rs index fa3c55a49..2ed4db970 100644 --- a/src/termdict/fst_termdict/mod.rs +++ b/src/termdict/fst_termdict/mod.rs @@ -1,24 +1,23 @@ -/*! -The term dictionary main role is to associate the sorted [`Term`s](../struct.Term.html) to -a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information -about the term. 
- -Internally, the term dictionary relies on the `fst` crate to store -a sorted mapping that associate each term to its rank in the lexicographical order. -For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan", -the `TermOrdinal` are respectively `0`, `1`, `2`, and `3`. - -For `u64`-terms, tantivy explicitely uses a `BigEndian` representation to ensure that the -lexicographical order matches the natural order of integers. - -`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()` -and then treated as a `u64`. - -`f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated -as `u64`. - -A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html). -*/ +//! The term dictionary's main role is to associate the sorted [`Term`s](../struct.Term.html) to +//! a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information +//! about the term. +//! +//! Internally, the term dictionary relies on the `fst` crate to store +//! a sorted mapping that associates each term to its rank in the lexicographical order. +//! For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan", +//! the `TermOrdinal` values are respectively `0`, `1`, `2`, and `3`. +//! +//! For `u64`-terms, tantivy explicitly uses a `BigEndian` representation to ensure that the +//! lexicographical order matches the natural order of integers. +//! +//! `i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()` +//! and then treated as a `u64`. +//! +//! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated +//! as `u64`. +//! +//! A second data structure makes it possible to access a +//! [`TermInfo`](../postings/struct.TermInfo.html). mod streamer; mod term_info_store; mod termdict; diff --git a/src/termdict/fst_termdict/streamer.rs b/src/termdict/fst_termdict/streamer.rs index 10a72d6dc..fc3bbd66c 100644 --- a/src/termdict/fst_termdict/streamer.rs +++ b/src/termdict/fst_termdict/streamer.rs @@ -1,26 +1,24 @@ use std::io; +use tantivy_fst::automaton::AlwaysMatch; +use tantivy_fst::map::{Stream, StreamBuilder}; +use tantivy_fst::{Automaton, IntoStreamer, Streamer}; + use super::TermDictionary; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; -use tantivy_fst::automaton::AlwaysMatch; -use tantivy_fst::map::{Stream, StreamBuilder}; -use tantivy_fst::Automaton; -use tantivy_fst::{IntoStreamer, Streamer}; /// `TermStreamerBuilder` is a helper object used to define /// a range of terms that should be streamed. pub struct TermStreamerBuilder<'a, A = AlwaysMatch> -where - A: Automaton, +where A: Automaton { fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>, } impl<'a, A> TermStreamerBuilder<'a, A> -where - A: Automaton, +where A: Automaton { pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self { TermStreamerBuilder { @@ -75,8 +73,7 @@ where /// `TermStreamer` acts as a cursor over a range of terms of a segment. /// Terms are guaranteed to be sorted. pub struct TermStreamer<'a, A = AlwaysMatch> -where - A: Automaton, +where A: Automaton { pub(crate) fst_map: &'a TermDictionary, pub(crate) stream: Stream<'a, A>, @@ -86,8 +83,7 @@ } impl<'a, A> TermStreamer<'a, A> -where - A: Automaton, +where A: Automaton { /// Advance position the stream on the next item. 
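    ///
    /// A minimal sketch of the usual cursor loop (the `stream` variable is assumed to be a
    /// `TermStreamer` obtained from a `TermDictionary`; illustrative only):
    ///
    /// ```ignore
    /// while stream.advance() {
    ///     // The key buffer is reused, so copy it out if it must outlive this iteration.
    ///     let term_bytes: &[u8] = stream.key();
    ///     let term_info = stream.value();
    /// }
    /// ```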
/// Before the first call to `.advance()`, the stream diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index 1bde6eb79..2bcea6bf5 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ b/src/termdict/fst_termdict/term_info_store.rs @@ -1,12 +1,13 @@ +use std::cmp; +use std::io::{self, Read, Write}; + +use byteorder::{ByteOrder, LittleEndian}; +use common::{BinarySerializable, FixedSize}; +use tantivy_bitpacker::{compute_num_bits, BitPacker}; + use crate::directory::{FileSlice, OwnedBytes}; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; -use byteorder::{ByteOrder, LittleEndian}; -use common::{BinarySerializable, FixedSize}; -use std::cmp; -use std::io::{self, Read, Write}; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; const BLOCK_LEN: usize = 256; @@ -287,15 +288,13 @@ impl TermInfoStoreWriter { #[cfg(test)] mod tests { - use super::extract_bits; - use super::TermInfoBlockMeta; - use super::{TermInfoStore, TermInfoStoreWriter}; - use crate::directory::FileSlice; - use crate::postings::TermInfo; use common; use common::BinarySerializable; - use tantivy_bitpacker::compute_num_bits; - use tantivy_bitpacker::BitPacker; + use tantivy_bitpacker::{compute_num_bits, BitPacker}; + + use super::{extract_bits, TermInfoBlockMeta, TermInfoStore, TermInfoStoreWriter}; + use crate::directory::FileSlice; + use crate::postings::TermInfo; #[test] fn test_term_info_block() { diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs index 078e12054..9469e6ab3 100644 --- a/src/termdict/fst_termdict/termdict.rs +++ b/src/termdict/fst_termdict/termdict.rs @@ -1,14 +1,16 @@ +use std::io::{self, Write}; + +use common::{BinarySerializable, CountingWriter}; +use once_cell::sync::Lazy; +use tantivy_fst::raw::Fst; +use tantivy_fst::Automaton; + use super::term_info_store::{TermInfoStore, TermInfoStoreWriter}; use super::{TermStreamer, TermStreamerBuilder}; use crate::directory::{FileSlice, OwnedBytes}; use crate::error::DataCorruption; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; -use common::{BinarySerializable, CountingWriter}; -use once_cell::sync::Lazy; -use std::io::{self, Write}; -use tantivy_fst::raw::Fst; -use tantivy_fst::Automaton; fn convert_fst_error(e: tantivy_fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -24,8 +26,7 @@ pub struct TermDictionaryBuilder { } impl TermDictionaryBuilder -where - W: Write, +where W: Write { /// Creates a new `TermDictionaryBuilder` pub fn create(w: W) -> io::Result { diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index 9b4cdb2ac..167b5d97d 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -1,12 +1,10 @@ -use crate::postings::TermInfo; -use crate::termdict::TermDictionary; -use crate::termdict::TermOrdinal; -use crate::termdict::TermStreamer; -use tantivy_fst::map::OpBuilder; -use tantivy_fst::map::Union; +use tantivy_fst::map::{OpBuilder, Union}; use tantivy_fst::raw::IndexedValue; use tantivy_fst::Streamer; +use crate::postings::TermInfo; +use crate::termdict::{TermDictionary, TermOrdinal, TermStreamer}; + /// Given a list of sorted term streams, /// returns an iterator over sorted unique terms. 
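/// A short usage sketch (assuming `streams` is a `Vec<TermStreamer>` holding one streamer
/// per segment; names are illustrative):
///
/// ```ignore
/// let mut merger = TermMerger::new(streams);
/// while merger.advance() {
///     // Each unique term is visited exactly once, in sorted order, even if it
///     // appears in several of the input streams.
///     let term_bytes: &[u8] = merger.key();
/// }
/// ```
///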
/// @@ -23,7 +21,6 @@ pub struct TermMerger<'a> { impl<'a> TermMerger<'a> { /// Stream of merged term dictionary - /// pub fn new(streams: Vec>) -> TermMerger<'a> { let mut op_builder = OpBuilder::new(); let mut dictionaries = vec![]; @@ -98,13 +95,14 @@ impl<'a> TermMerger<'a> { #[cfg(all(test, feature = "unstable"))] mod bench { + use rand::distributions::Alphanumeric; + use rand::{thread_rng, Rng}; + use test::{self, Bencher}; + use super::TermMerger; use crate::directory::FileSlice; use crate::postings::TermInfo; use crate::termdict::{TermDictionary, TermDictionaryBuilder}; - use rand::distributions::Alphanumeric; - use rand::{thread_rng, Rng}; - use test::{self, Bencher}; fn make_term_info(term_ord: u64) -> TermInfo { let offset = |term_ord: u64| (term_ord * 100 + term_ord * term_ord) as usize; diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index fca690459..a1fe98a02 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -1,24 +1,23 @@ -/*! -The term dictionary main role is to associate the sorted [`Term`s](../struct.Term.html) to -a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information -about the term. - -Internally, the term dictionary relies on the `fst` crate to store -a sorted mapping that associate each term to its rank in the lexicographical order. -For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan", -the [TermOrdinal] are respectively `0`, `1`, `2`, and `3`. - -For `u64`-terms, tantivy explicitely uses a `BigEndian` representation to ensure that the -lexicographical order matches the natural order of integers. - -`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()` -and then treated as a `u64`. - -`f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated -as `u64`. - -A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html). -*/ +//! The term dictionary's main role is to associate the sorted [`Term`s](../struct.Term.html) to +//! a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information +//! about the term. +//! +//! Internally, the term dictionary relies on the `fst` crate to store +//! a sorted mapping that associates each term to its rank in the lexicographical order. +//! For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan", +//! the [TermOrdinal] values are respectively `0`, `1`, `2`, and `3`. +//! +//! For `u64`-terms, tantivy explicitly uses a `BigEndian` representation to ensure that the +//! lexicographical order matches the natural order of integers. +//! +//! `i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()` +//! and then treated as a `u64`. +//! +//! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated +//! as `u64`. +//! +//! A second data structure makes it possible to access a +//! [`TermInfo`](../postings/struct.TermInfo.html). mod fst_termdict; use fst_termdict as termdict; @@ -26,9 +25,7 @@ use fst_termdict as termdict; mod merger; pub use self::merger::TermMerger; -pub use self::termdict::TermDictionary; -pub use self::termdict::TermDictionaryBuilder; -pub use self::termdict::TermStreamer; +pub use self::termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer}; /// Position of the term in the sorted list of terms. 
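///
/// A small sketch of the ordinal mapping, assuming a dictionary `term_dict` built over the
/// sorted terms "abba", "bjork", "blur" and "donovan" (illustrative only):
///
/// ```ignore
/// assert_eq!(term_dict.term_ord("blur")?, Some(2u64));
/// let mut term_bytes = Vec::new();
/// assert!(term_dict.ord_to_term(2u64, &mut term_bytes)?);
/// assert_eq!(&term_bytes[..], b"blur");
/// ```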
pub type TermOrdinal = u64; diff --git a/src/termdict/tests.rs b/src/termdict/tests.rs index 4fb8faacc..2a649ca9a 100644 --- a/src/termdict/tests.rs +++ b/src/termdict/tests.rs @@ -1,11 +1,10 @@ -use super::{TermDictionary, TermDictionaryBuilder, TermStreamer}; - -use crate::directory::{Directory, FileSlice, RamDirectory, TerminatingWrite}; -use crate::postings::TermInfo; - use std::path::PathBuf; use std::str; +use super::{TermDictionary, TermDictionaryBuilder, TermStreamer}; +use crate::directory::{Directory, FileSlice, RamDirectory, TerminatingWrite}; +use crate::postings::TermInfo; + const BLOCK_SIZE: usize = 1_500; fn make_term_info(term_ord: u64) -> TermInfo { @@ -390,9 +389,10 @@ fn test_stream_term_ord() -> crate::Result<()> { #[test] fn test_automaton_search() -> crate::Result<()> { - use crate::query::DfaWrapper; use levenshtein_automata::LevenshteinAutomatonBuilder; + use crate::query::DfaWrapper; + const COUNTRIES: [&str; 7] = [ "San Marino", "Serbia", diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index 59bcc5f27..542519e12 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -1,6 +1,7 @@ -use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; use std::mem; +use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; + /// This class converts alphabetic, numeric, and symbolic Unicode characters /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode /// block) into their ASCII equivalents, if one exists. @@ -1540,13 +1541,11 @@ fn to_ascii(text: &mut String, output: &mut String) { #[cfg(test)] mod tests { - use super::to_ascii; - use crate::tokenizer::AsciiFoldingFilter; - use crate::tokenizer::RawTokenizer; - use crate::tokenizer::SimpleTokenizer; - use crate::tokenizer::TextAnalyzer; use std::iter; + use super::to_ascii; + use crate::tokenizer::{AsciiFoldingFilter, RawTokenizer, SimpleTokenizer, TextAnalyzer}; + #[test] fn test_ascii_folding() { assert_eq!(&folding_helper("Ràmon"), &["Ramon"]); diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index da878220f..3373948e3 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,6 +1,7 @@ +use std::mem; + use super::{Token, TokenFilter, TokenStream}; use crate::tokenizer::BoxTokenStream; -use std::mem; impl TokenFilter for LowerCaser { fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 95178acd1..a3af31ad5 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -88,7 +88,8 @@ //! If you built your schema programmatically, a complete example //! could like this for instance. //! -//! Note that tokens with a len greater or equal to [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html). +//! Note that tokens with a len greater or equal to +//! [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html). //! //! # Example //! @@ -116,7 +117,6 @@ //! .tokenizers() //! .register("custom_en", custom_en_tokenizer); //! ``` -//! 
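//!
//! A follow-up sketch of tokenizing text with the analyzer registered above (the
//! `"custom_en"` name comes from the previous example; the input string and printed
//! values are illustrative):
//!
//! ```ignore
//! let custom_en = index.tokenizers().get("custom_en").unwrap();
//! let mut token_stream = custom_en.token_stream("Hello, happy tax payer!");
//! while let Some(token) = token_stream.next() {
//!     // e.g. "hello" with byte offsets 0..5 at position 0.
//!     println!("{} {}..{} (pos {})", token.text, token.offset_from, token.offset_to, token.position);
//! }
//! ```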
mod alphanum_only; mod ascii_folding_filter; mod facet_tokenizer; @@ -144,14 +144,12 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::{Language, Stemmer}; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::whitespace_tokenizer::WhitespaceTokenizer; - pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenizer::{ BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer, }; - pub use self::tokenizer_manager::TokenizerManager; +pub use self::whitespace_tokenizer::WhitespaceTokenizer; /// Maximum authorized len (in bytes) for a token. /// diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index bd9dbf097..b268d6105 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -21,7 +21,8 @@ use crate::tokenizer::BoxTokenStream; /// | Position | 0 | 0 | 0 | 0 | /// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 | /// -/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**) +/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: +/// **true**) /// /// | Term | hε | hεl | hεll | hεllo | /// |----------|-----|-----|-------|-------| @@ -191,8 +192,7 @@ struct StutteringIterator { } impl StutteringIterator -where - T: Iterator, +where T: Iterator { pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator { assert!(min_gram > 0); @@ -221,8 +221,7 @@ where } impl Iterator for StutteringIterator -where - T: Iterator, +where T: Iterator { type Item = (usize, usize); @@ -302,10 +301,7 @@ fn utf8_codepoint_width(b: u8) -> usize { #[cfg(test)] mod tests { - use super::utf8_codepoint_width; - use super::CodepointFrontiers; - use super::NgramTokenizer; - use super::StutteringIterator; + use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator}; use crate::tokenizer::tests::assert_token; use crate::tokenizer::tokenizer::Tokenizer; use crate::tokenizer::{BoxTokenStream, Token}; diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index dae685392..173291904 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -11,7 +11,6 @@ //! assert_eq!(stream.next().unwrap().text, "nice"); //! assert!(stream.next().is_none()); //! ``` -//! use super::{Token, TokenFilter, TokenStream}; use crate::tokenizer::BoxTokenStream; diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index 1c17e4e2a..153f245c9 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -1,7 +1,7 @@ -use super::BoxTokenStream; -use super::{Token, TokenStream, Tokenizer}; use std::str::CharIndices; +use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; + /// Tokenize the text by splitting on whitespaces and punctuation. #[derive(Clone)] pub struct SimpleTokenizer; diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 8facade1a..361a89410 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -1,8 +1,9 @@ -use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; use rust_stemmers::{self, Algorithm}; use serde::{Deserialize, Serialize}; +use super::{Token, TokenFilter, TokenStream}; +use crate::tokenizer::BoxTokenStream; + /// Available stemmer languages. 
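///
/// For instance, one of these variants can be handed to a `Stemmer` when building an
/// analyzer (a sketch; the chain shown is illustrative):
///
/// ```ignore
/// let analyzer = TextAnalyzer::from(SimpleTokenizer)
///     .filter(LowerCaser)
///     .filter(Stemmer::new(Language::French));
/// ```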
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)] #[allow(missing_docs)] diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index 31f72ca25..cf0a3b638 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -10,12 +10,14 @@ //! assert_eq!(stream.next().unwrap().text, "crafty"); //! assert!(stream.next().is_none()); //! ``` -use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; -use fnv::FnvHasher; use std::collections::HashSet; use std::hash::BuildHasherDefault; +use fnv::FnvHasher; + +use super::{Token, TokenFilter, TokenStream}; +use crate::tokenizer::BoxTokenStream; + // configure our hashers for SPEED type StopWordHasher = BuildHasherDefault; type StopWordHashSet = HashSet; diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index cf8ef0206..d5e669dec 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -1,6 +1,7 @@ -use crate::tokenizer::{BoxTokenStream, Token, TokenStream}; use std::ops::DerefMut; +use crate::tokenizer::{BoxTokenStream, Token, TokenStream}; + const POSITION_GAP: usize = 2; pub(crate) struct TokenStreamChain<'a> { @@ -67,8 +68,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> { #[cfg(test)] mod tests { use super::super::{SimpleTokenizer, TokenStream, Tokenizer}; - use super::TokenStreamChain; - use super::POSITION_GAP; + use super::{TokenStreamChain, POSITION_GAP}; #[test] fn test_chain_first_emits_no_tokens() { diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs index e091c6018..86c4f9907 100644 --- a/src/tokenizer/tokenized_string.rs +++ b/src/tokenizer/tokenized_string.rs @@ -1,7 +1,9 @@ -use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain}; -use serde::{Deserialize, Serialize}; use std::cmp::Ordering; +use serde::{Deserialize, Serialize}; + +use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain}; + /// Struct representing pre-tokenized text #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct PreTokenizedString { @@ -91,7 +93,6 @@ impl TokenStream for PreTokenizedStream { mod tests { use super::*; - use crate::tokenizer::Token; #[test] diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index cd4b0c222..82056a07c 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -1,10 +1,12 @@ -use crate::tokenizer::TokenStreamChain; -use serde::{Deserialize, Serialize}; /// The tokenizer module contains all of the tools used to process /// text in `tantivy`. 
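///
/// A minimal sketch of what an emitted `Token` carries (offsets are byte offsets into the
/// original text; the input string is illustrative):
///
/// ```ignore
/// let analyzer = TextAnalyzer::from(SimpleTokenizer);
/// let mut token_stream = analyzer.token_stream("hello world");
/// let token = token_stream.next().unwrap();
/// assert_eq!(token.text, "hello");
/// assert_eq!((token.offset_from, token.offset_to), (0, 5));
/// assert_eq!(token.position, 0);
/// ```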
use std::borrow::{Borrow, BorrowMut}; use std::ops::{Deref, DerefMut}; +use serde::{Deserialize, Serialize}; + +use crate::tokenizer::TokenStreamChain; + /// Token #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct Token { @@ -76,7 +78,6 @@ impl TextAnalyzer { /// .filter(LowerCaser) /// .filter(Stemmer::default()); /// ``` - /// pub fn filter>(mut self, token_filter: F) -> Self { self.token_filters.push(token_filter.into()); self @@ -176,8 +177,7 @@ impl<'a> TokenStream for Box { pub struct BoxTokenStream<'a>(Box); impl<'a, T> From for BoxTokenStream<'a> -where - T: TokenStream + 'a, +where T: TokenStream + 'a { fn from(token_stream: T) -> BoxTokenStream<'a> { BoxTokenStream(Box::new(token_stream)) @@ -244,7 +244,6 @@ impl From for BoxTokenFilter { /// assert_eq!(token.position, 1); /// } /// ``` -/// pub trait TokenStream { /// Advance to the next token /// diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index 07f2e7ae1..fefa5e416 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -1,14 +1,12 @@ -use crate::tokenizer::stemmer::Language; -use crate::tokenizer::tokenizer::TextAnalyzer; -use crate::tokenizer::LowerCaser; -use crate::tokenizer::RawTokenizer; -use crate::tokenizer::RemoveLongFilter; -use crate::tokenizer::SimpleTokenizer; -use crate::tokenizer::Stemmer; -use crate::tokenizer::WhitespaceTokenizer; use std::collections::HashMap; use std::sync::{Arc, RwLock}; +use crate::tokenizer::stemmer::Language; +use crate::tokenizer::tokenizer::TextAnalyzer; +use crate::tokenizer::{ + LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, WhitespaceTokenizer, +}; + /// The tokenizer manager serves as a store for /// all of the pre-configured tokenizer pipelines. /// @@ -29,9 +27,7 @@ pub struct TokenizerManager { impl TokenizerManager { /// Registers a new tokenizer associated with a given name. pub fn register(&self, tokenizer_name: &str, tokenizer: T) - where - TextAnalyzer: From, - { + where TextAnalyzer: From { let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer); self.tokenizers .write() diff --git a/src/tokenizer/whitespace_tokenizer.rs b/src/tokenizer/whitespace_tokenizer.rs index 6daed4f3f..6d7b7369a 100644 --- a/src/tokenizer/whitespace_tokenizer.rs +++ b/src/tokenizer/whitespace_tokenizer.rs @@ -1,7 +1,7 @@ -use super::BoxTokenStream; -use super::{Token, TokenStream, Tokenizer}; use std::str::CharIndices; +use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; + /// Tokenize the text by splitting on whitespaces. #[derive(Clone)] pub struct WhitespaceTokenizer; diff --git a/tests/failpoints/mod.rs b/tests/failpoints/mod.rs index 0e33313db..3f5e362ce 100644 --- a/tests/failpoints/mod.rs +++ b/tests/failpoints/mod.rs @@ -1,8 +1,8 @@ use std::path::Path; + use tantivy::directory::{Directory, ManagedDirectory, RamDirectory, TerminatingWrite}; -use tantivy::doc; use tantivy::schema::{Schema, TEXT}; -use tantivy::{Index, Term}; +use tantivy::{doc, Index, Term}; #[test] fn test_failpoints_managed_directory_gc_if_delete_fails() {