diff --git a/bitpacker/src/lib.rs b/bitpacker/src/lib.rs index 1697a8488..141fe66a5 100644 --- a/bitpacker/src/lib.rs +++ b/bitpacker/src/lib.rs @@ -50,3 +50,32 @@ where } None } + +#[test] +fn test_compute_num_bits() { + assert_eq!(compute_num_bits(1), 1u8); + assert_eq!(compute_num_bits(0), 0u8); + assert_eq!(compute_num_bits(2), 2u8); + assert_eq!(compute_num_bits(3), 2u8); + assert_eq!(compute_num_bits(4), 3u8); + assert_eq!(compute_num_bits(255), 8u8); + assert_eq!(compute_num_bits(256), 9u8); + assert_eq!(compute_num_bits(5_000_000_000), 33u8); +} + +#[test] +fn test_minmax_empty() { + let vals: Vec = vec![]; + assert_eq!(minmax(vals.into_iter()), None); +} + +#[test] +fn test_minmax_one() { + assert_eq!(minmax(vec![1].into_iter()), Some((1, 1))); +} + +#[test] +fn test_minmax_two() { + assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2))); + assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2))); +} diff --git a/common/Cargo.toml b/common/Cargo.toml index b9f09b950..94b40a459 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -12,4 +12,5 @@ description = "common traits and utility functions used by multiple tantivy subc byteorder = "1.4.3" [dev-dependencies] +proptest = "1.0.0" rand = "0.8.4" diff --git a/common/src/lib.rs b/common/src/lib.rs index 46b2e0222..ef95ce659 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -1,3 +1,5 @@ +use std::ops::Deref; + pub use byteorder::LittleEndian as Endianness; mod bitset; @@ -9,3 +11,157 @@ pub use bitset::*; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt}; pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite}; + +/// Has length trait +pub trait HasLen { + /// Return length + fn len(&self) -> usize; + + /// Returns true iff empty. 
+ fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl> HasLen for T { + fn len(&self) -> usize { + self.deref().len() + } +} + +const HIGHEST_BIT: u64 = 1 << 63; + +/// Maps a `i64` to `u64` +/// +/// For simplicity, tantivy internally handles `i64` as `u64`. +/// The mapping is defined by this function. +/// +/// Maps `i64` to `u64` so that +/// `-2^63 .. 2^63-1` is mapped +/// to +/// `0 .. 2^64-1` +/// in that order. +/// +/// This is more suited than simply casting (`val as u64`) +/// because of bitpacking. +/// +/// Imagine a list of `i64` ranging from -10 to 10. +/// When casting negative values, the negative values are projected +/// to values over 2^63, and all values end up requiring 64 bits. +/// +/// # See also +/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html). +#[inline] +pub fn i64_to_u64(val: i64) -> u64 { + (val as u64) ^ HIGHEST_BIT +} + +/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html). +#[inline] +pub fn u64_to_i64(val: u64) -> i64 { + (val ^ HIGHEST_BIT) as i64 +} + +/// Maps a `f64` to `u64` +/// +/// For simplicity, tantivy internally handles `f64` as `u64`. +/// The mapping is defined by this function. +/// +/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved. +/// +/// This is more suited than simply casting (`val as u64`) +/// which would truncate the result. +/// +/// # Reference +/// +/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/) +/// explains the mapping in a clear manner. +/// +/// # See also +/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html). +#[inline] +pub fn f64_to_u64(val: f64) -> u64 { + let bits = val.to_bits(); + if val.is_sign_positive() { + bits ^ HIGHEST_BIT + } else { + !bits + } +} + +/// Reverse the mapping given by [`f64_to_u64`](./fn.f64_to_u64.html). 
+#[inline] +pub fn u64_to_f64(val: u64) -> f64 { + f64::from_bits(if val & HIGHEST_BIT != 0 { + val ^ HIGHEST_BIT + } else { + !val + }) +} + +#[cfg(test)] +pub mod test { + + use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; + use super::{BinarySerializable, FixedSize}; + use proptest::prelude::*; + use std::f64; + + fn test_i64_converter_helper(val: i64) { + assert_eq!(u64_to_i64(i64_to_u64(val)), val); + } + + fn test_f64_converter_helper(val: f64) { + assert_eq!(u64_to_f64(f64_to_u64(val)), val); + } + + pub fn fixed_size_test() { + let mut buffer = Vec::new(); + O::default().serialize(&mut buffer).unwrap(); + assert_eq!(buffer.len(), O::SIZE_IN_BYTES); + } + + proptest! { + #[test] + fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) { + let left_u64 = f64_to_u64(left); + let right_u64 = f64_to_u64(right); + assert_eq!(left_u64 < right_u64, left < right); + } + } + + #[test] + fn test_i64_converter() { + assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); + assert_eq!(i64_to_u64(i64::max_value()), u64::max_value()); + test_i64_converter_helper(0i64); + test_i64_converter_helper(i64::min_value()); + test_i64_converter_helper(i64::max_value()); + for i in -1000i64..1000i64 { + test_i64_converter_helper(i); + } + } + + #[test] + fn test_f64_converter() { + test_f64_converter_helper(f64::INFINITY); + test_f64_converter_helper(f64::NEG_INFINITY); + test_f64_converter_helper(0.0); + test_f64_converter_helper(-0.0); + test_f64_converter_helper(1.0); + test_f64_converter_helper(-1.0); + } + + #[test] + fn test_f64_order() { + assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY)) + .contains(&f64_to_u64(f64::NAN))); //nan is not a number + assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa + assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent + assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and 
mantissa + assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg + assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0)); + assert!(f64_to_u64(-2.0) < f64_to_u64(1.0)); + assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5)); + } +} diff --git a/src/common/mod.rs b/src/common/mod.rs deleted file mode 100644 index cb1f6949b..000000000 --- a/src/common/mod.rs +++ /dev/null @@ -1,197 +0,0 @@ -pub use byteorder::LittleEndian as Endianness; -pub use common::CountingWriter; -pub use common::{ - read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, -}; -pub use common::{BinarySerializable, DeserializeFrom, FixedSize}; - -/// Segment's max doc must be `< MAX_DOC_LIMIT`. -/// -/// We do not allow segments with more than -pub const MAX_DOC_LIMIT: u32 = 1 << 31; - -/// Has length trait -pub trait HasLen { - /// Return length - fn len(&self) -> usize; - - /// Returns true iff empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -const HIGHEST_BIT: u64 = 1 << 63; - -/// Maps a `i64` to `u64` -/// -/// For simplicity, tantivy internally handles `i64` as `u64`. -/// The mapping is defined by this function. -/// -/// Maps `i64` to `u64` so that -/// `-2^63 .. 2^63-1` is mapped -/// to -/// `0 .. 2^64-1` -/// in that order. -/// -/// This is more suited than simply casting (`val as u64`) -/// because of bitpacking. -/// -/// Imagine a list of `i64` ranging from -10 to 10. -/// When casting negative values, the negative values are projected -/// to values over 2^63, and all values end up requiring 64 bits. -/// -/// # See also -/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html). -#[inline] -pub fn i64_to_u64(val: i64) -> u64 { - (val as u64) ^ HIGHEST_BIT -} - -/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html). -#[inline] -pub fn u64_to_i64(val: u64) -> i64 { - (val ^ HIGHEST_BIT) as i64 -} - -/// Maps a `f64` to `u64` -/// -/// For simplicity, tantivy internally handles `f64` as `u64`. 
-/// The mapping is defined by this function. -/// -/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved. -/// -/// This is more suited than simply casting (`val as u64`) -/// which would truncate the result -/// -/// # Reference -/// -/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/) -/// explains the mapping in a clear manner. -/// -/// # See also -/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html). -#[inline] -pub fn f64_to_u64(val: f64) -> u64 { - let bits = val.to_bits(); - if val.is_sign_positive() { - bits ^ HIGHEST_BIT - } else { - !bits - } -} - -/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html). -#[inline] -pub fn u64_to_f64(val: u64) -> f64 { - f64::from_bits(if val & HIGHEST_BIT != 0 { - val ^ HIGHEST_BIT - } else { - !val - }) -} - -#[cfg(test)] -pub(crate) mod test { - - use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; - use common::{BinarySerializable, FixedSize}; - use proptest::prelude::*; - use std::f64; - use tantivy_bitpacker::compute_num_bits; - pub use tantivy_bitpacker::minmax; - - fn test_i64_converter_helper(val: i64) { - assert_eq!(u64_to_i64(i64_to_u64(val)), val); - } - - fn test_f64_converter_helper(val: f64) { - assert_eq!(u64_to_f64(f64_to_u64(val)), val); - } - - pub fn fixed_size_test() { - let mut buffer = Vec::new(); - O::default().serialize(&mut buffer).unwrap(); - assert_eq!(buffer.len(), O::SIZE_IN_BYTES); - } - - proptest! 
{ - #[test] - fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) { - let left_u64 = f64_to_u64(left); - let right_u64 = f64_to_u64(right); - assert_eq!(left_u64 < right_u64, left < right); - } - } - - #[test] - fn test_i64_converter() { - assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); - assert_eq!(i64_to_u64(i64::max_value()), u64::max_value()); - test_i64_converter_helper(0i64); - test_i64_converter_helper(i64::min_value()); - test_i64_converter_helper(i64::max_value()); - for i in -1000i64..1000i64 { - test_i64_converter_helper(i); - } - } - - #[test] - fn test_f64_converter() { - test_f64_converter_helper(f64::INFINITY); - test_f64_converter_helper(f64::NEG_INFINITY); - test_f64_converter_helper(0.0); - test_f64_converter_helper(-0.0); - test_f64_converter_helper(1.0); - test_f64_converter_helper(-1.0); - } - - #[test] - fn test_f64_order() { - assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY)) - .contains(&f64_to_u64(f64::NAN))); //nan is not a number - assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa - assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent - assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa - assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg - assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0)); - assert!(f64_to_u64(-2.0) < f64_to_u64(1.0)); - assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5)); - } - - #[test] - fn test_compute_num_bits() { - assert_eq!(compute_num_bits(1), 1u8); - assert_eq!(compute_num_bits(0), 0u8); - assert_eq!(compute_num_bits(2), 2u8); - assert_eq!(compute_num_bits(3), 2u8); - assert_eq!(compute_num_bits(4), 3u8); - assert_eq!(compute_num_bits(255), 8u8); - assert_eq!(compute_num_bits(256), 9u8); - assert_eq!(compute_num_bits(5_000_000_000), 33u8); - } - - #[test] - fn test_max_doc() { - // this is the first time I write a unit test for a 
constant. - assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0); - assert!((super::MAX_DOC_LIMIT as i32) < 0); - } - - #[test] - fn test_minmax_empty() { - let vals: Vec = vec![]; - assert_eq!(minmax(vals.into_iter()), None); - } - - #[test] - fn test_minmax_one() { - assert_eq!(minmax(vec![1].into_iter()), Some((1, 1))); - } - - #[test] - fn test_minmax_two() { - assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2))); - assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2))); - } -} diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 3371de1c3..e710e8ff9 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,6 +1,5 @@ use std::io; -use crate::common::BinarySerializable; use crate::directory::FileSlice; use crate::positions::PositionReader; use crate::postings::TermInfo; @@ -8,6 +7,7 @@ use crate::postings::{BlockSegmentPostings, SegmentPostings}; use crate::schema::IndexRecordOption; use crate::schema::Term; use crate::termdict::TermDictionary; +use common::BinarySerializable; /// The inverted index reader is in charge of accessing /// the inverted index associated to a specific field. 
diff --git a/src/directory/composite_file.rs b/src/directory/composite_file.rs index 5619f803c..6d542609b 100644 --- a/src/directory/composite_file.rs +++ b/src/directory/composite_file.rs @@ -1,12 +1,12 @@ -use crate::common::BinarySerializable; -use crate::common::CountingWriter; -use crate::common::HasLen; -use crate::common::VInt; use crate::directory::FileSlice; use crate::directory::{TerminatingWrite, WritePtr}; use crate::schema::Field; use crate::space_usage::FieldUsage; use crate::space_usage::PerFieldSpaceUsage; +use common::BinarySerializable; +use common::CountingWriter; +use common::HasLen; +use common::VInt; use std::collections::HashMap; use std::io::{self, Read, Write}; use std::iter::ExactSizeIterator; @@ -187,10 +187,10 @@ impl CompositeFile { mod test { use super::{CompositeFile, CompositeWrite}; - use crate::common::BinarySerializable; - use crate::common::VInt; use crate::directory::{Directory, RamDirectory}; use crate::schema::Field; + use common::BinarySerializable; + use common::VInt; use std::io::Write; use std::path::Path; diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs index 1a159ebfa..cd267da16 100644 --- a/src/directory/file_slice.rs +++ b/src/directory/file_slice.rs @@ -1,7 +1,7 @@ use stable_deref_trait::StableDeref; -use crate::common::HasLen; use crate::directory::OwnedBytes; +use common::HasLen; use std::fmt; use std::ops::Range; use std::sync::{Arc, Weak}; @@ -32,12 +32,6 @@ impl FileHandle for &'static [u8] { } } -impl> HasLen for T { - fn len(&self) -> usize { - self.deref().len() - } -} - impl From for FileSlice where B: StableDeref + Deref + 'static + Send + Sync, @@ -178,7 +172,7 @@ impl HasLen for FileSlice { #[cfg(test)] mod tests { use super::{FileHandle, FileSlice}; - use crate::common::HasLen; + use common::HasLen; use std::io; #[test] diff --git a/src/directory/footer.rs b/src/directory/footer.rs index 79eaa53e9..590088791 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -1,10 
+1,10 @@ use crate::directory::error::Incompatibility; use crate::directory::FileSlice; use crate::{ - common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen}, directory::{AntiCallToken, TerminatingWrite}, Version, INDEX_FORMAT_VERSION, }; +use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen}; use crc32fast::Hasher; use serde::{Deserialize, Serialize}; use std::io; @@ -156,10 +156,8 @@ mod tests { use crate::directory::footer::Footer; use crate::directory::OwnedBytes; - use crate::{ - common::BinarySerializable, - directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice}, - }; + use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice}; + use common::BinarySerializable; use std::io; #[test] diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 4ec0eda9b..397a050aa 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -485,13 +485,14 @@ mod tests { // The following tests are specific to the MmapDirectory use super::*; + use crate::indexer::LogMergePolicy; use crate::Index; use crate::ReloadPolicy; - use crate::{common::HasLen, indexer::LogMergePolicy}; use crate::{ schema::{Schema, SchemaBuilder, TEXT}, IndexSettings, }; + use common::HasLen; #[test] fn test_open_non_existent_path() { diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 3a3f38e06..29b9042fd 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -1,9 +1,10 @@ +use crate::core::META_FILEPATH; use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError}; use crate::directory::AntiCallToken; use crate::directory::WatchCallbackList; use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle}; use crate::directory::{TerminatingWrite, WritePtr}; -use crate::{common::HasLen, core::META_FILEPATH}; +use common::HasLen; use fail::fail_point; use std::collections::HashMap; use std::fmt; diff --git 
a/src/fastfield/delete.rs b/src/fastfield/delete.rs index fbf3c33dd..421761d63 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -1,10 +1,10 @@ -use crate::common::HasLen; -use common::BitSet; use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::directory::WritePtr; use crate::space_usage::ByteCount; use crate::DocId; +use common::BitSet; +use common::HasLen; use std::io; use std::io::Write; @@ -111,7 +111,7 @@ impl HasLen for DeleteBitSet { #[cfg(test)] mod tests { use super::DeleteBitSet; - use crate::common::HasLen; + use common::HasLen; #[test] fn test_delete_bitset_empty() { diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 24cf4872b..a3dc8c17f 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -40,11 +40,11 @@ pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; use crate::schema::Cardinality; use crate::schema::FieldType; use crate::schema::Value; +use crate::DocId; use crate::{ chrono::{NaiveDateTime, Utc}, schema::Type, }; -use crate::{common, DocId}; mod bytes; mod delete; @@ -214,7 +214,6 @@ mod tests { use super::*; use crate::directory::CompositeFile; - use crate::common::HasLen; use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::merge_policy::NoMergePolicy; use crate::schema::Field; @@ -222,6 +221,7 @@ mod tests { use crate::schema::FAST; use crate::schema::{Document, IntOptions}; use crate::{Index, SegmentId, SegmentReader}; + use common::HasLen; use once_cell::sync::Lazy; use rand::prelude::SliceRandom; use rand::rngs::StdRng; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index b5902796e..0fbefb7d6 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,5 +1,4 @@ use super::FastValue; -use crate::common::BinarySerializable; use crate::directory::CompositeFile; use crate::directory::FileSlice; use crate::directory::OwnedBytes; @@ -8,6 +7,7 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter}; use 
crate::schema::Schema; use crate::schema::FAST; use crate::DocId; +use common::BinarySerializable; use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader; use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader; diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 26a9edbbe..7d8ea8fe5 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -1,8 +1,8 @@ -use crate::common::BinarySerializable; use crate::directory::CompositeWrite; -use crate::common::CountingWriter; use crate::directory::WritePtr; use crate::schema::Field; +use common::BinarySerializable; +use common::CountingWriter; pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy; use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 9e9893454..f7d4110ff 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,12 +1,12 @@ use super::multivalued::MultiValuedFastFieldWriter; use super::serializer::FastFieldStats; use super::FastFieldDataAccess; -use crate::common; use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer}; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId; use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema}; use crate::termdict::TermOrdinal; +use common; use fnv::FnvHashMap; use std::collections::HashMap; use std::io; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index e644a1e61..f498099b9 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1,7 +1,6 @@ use super::operation::{AddOperation, UserOperation}; use super::segment_updater::SegmentUpdater; use super::PreparedCommit; -use common::BitSet; use crate::core::Index; use 
crate::core::Segment; use crate::core::SegmentComponent; @@ -24,6 +23,7 @@ use crate::schema::Document; use crate::schema::IndexRecordOption; use crate::schema::Term; use crate::Opstamp; +use common::BitSet; use crossbeam::channel; use futures::executor::block_on; use futures::future::Future; diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index b737f533c..8d07506cc 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -5,6 +5,7 @@ use crate::fastfield::DynamicFastFieldReader; use crate::fastfield::FastFieldDataAccess; use crate::fastfield::FastFieldReader; use crate::fastfield::FastFieldStats; +use crate::fastfield::MultiValueLength; use crate::fastfield::MultiValuedFastFieldReader; use crate::fieldnorm::FieldNormsSerializer; use crate::fieldnorm::FieldNormsWriter; @@ -19,9 +20,8 @@ use crate::schema::{Field, Schema}; use crate::store::StoreWriter; use crate::termdict::TermMerger; use crate::termdict::TermOrdinal; +use crate::IndexSettings; use crate::IndexSortByField; -use crate::{common::HasLen, fastfield::MultiValueLength}; -use crate::{common::MAX_DOC_LIMIT, IndexSettings}; use crate::{core::Segment, indexer::doc_id_mapping::expect_field_id_for_sort_field}; use crate::{core::SegmentReader, Order}; use crate::{ @@ -29,6 +29,7 @@ use crate::{ SegmentOrdinal, }; use crate::{DocId, InvertedIndexReader, SegmentComponent}; +use common::HasLen; use itertools::Itertools; use measure_time::debug_time; use std::cmp; @@ -36,6 +37,11 @@ use std::collections::HashMap; use std::sync::Arc; use tantivy_bitpacker::minmax; +/// Segment's max doc must be `< MAX_DOC_LIMIT`. 
+/// +/// We do not allow segments with a max doc of `2^31` or more: every allowed value stays non-negative when cast to `i32` (checked by `test_max_doc`). +pub const MAX_DOC_LIMIT: u32 = 1 << 31; + fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result { let mut total_tokens = 0u64; let mut count: [usize; 256] = [0; 256]; @@ -2075,4 +2081,11 @@ mod tests { Ok(()) } + + #[test] + fn test_max_doc() { + // this is the first time I write a unit test for a constant. + assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0); + assert!((super::MAX_DOC_LIMIT as i32) < 0); + } } diff --git a/src/lib.rs b/src/lib.rs index bc1beb88d..fa1da43a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -135,7 +135,6 @@ pub type Result = std::result::Result; /// Tantivy DateTime pub type DateTime = chrono::DateTime; -mod common; mod core; mod indexer; @@ -163,8 +162,6 @@ pub use self::snippet::{Snippet, SnippetGenerator}; mod docset; pub use self::docset::{DocSet, TERMINATED}; -pub use crate::common::HasLen; -pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; pub use crate::core::{Executor, SegmentComponent}; pub use crate::core::{ Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment, @@ -178,6 +175,8 @@ pub use crate::indexer::IndexWriter; pub use crate::postings::Postings; pub use crate::reader::LeasedItem; pub use crate::schema::{Document, Term}; +pub use common::HasLen; +pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; use std::fmt; use once_cell::sync::Lazy; @@ -293,7 +292,7 @@ pub struct DocAddress { } #[cfg(test)] -mod tests { +pub mod tests { use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE; use crate::core::SegmentReader; use crate::docset::{DocSet, TERMINATED}; @@ -304,11 +303,18 @@ mod tests { use crate::Index; use crate::Postings; use crate::ReloadPolicy; + use common::{BinarySerializable, FixedSize}; use rand::distributions::Bernoulli; use rand::distributions::Uniform; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; + pub fn fixed_size_test() { + let mut buffer = 
Vec::new(); + O::default().serialize(&mut buffer).unwrap(); + assert_eq!(buffer.len(), O::SIZE_IN_BYTES); + } + /// Checks if left and right are close one to each other. /// Panics if the two values are more than 0.5% apart. #[macro_export] @@ -348,10 +354,6 @@ mod tests { .collect() } - pub fn sample(n: u32, ratio: f64) -> Vec { - sample_with_seed(n, ratio, 4) - } - #[test] #[cfg(not(feature = "lz4"))] fn test_version_string() { diff --git a/src/positions/reader.rs b/src/positions/reader.rs index 5a046ad1d..25f857cc1 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -1,9 +1,9 @@ use std::io; -use crate::common::{BinarySerializable, VInt}; use crate::directory::OwnedBytes; use crate::positions::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::{BlockDecoder, VIntDecoder}; +use common::{BinarySerializable, VInt}; /// When accessing the position of a term, we get a positions_idx from the `Terminfo`. /// This means we need to skip to the `nth` positions efficiently. 
diff --git a/src/positions/serializer.rs b/src/positions/serializer.rs index 45a06c5c8..23f242335 100644 --- a/src/positions/serializer.rs +++ b/src/positions/serializer.rs @@ -1,7 +1,7 @@ -use crate::common::{BinarySerializable, CountingWriter, VInt}; use crate::positions::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::BlockEncoder; use crate::postings::compression::VIntEncoder; +use common::{BinarySerializable, CountingWriter, VInt}; use std::io::{self, Write}; /// The PositionSerializer is in charge of serializing all of the positions diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index f3877b84f..d12e8e994 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -1,6 +1,5 @@ use std::io; -use crate::common::{BinarySerializable, VInt}; use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::fieldnorm::FieldNormReader; @@ -9,6 +8,7 @@ use crate::postings::{BlockInfo, FreqReadingOption, SkipReader}; use crate::query::Bm25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score, TERMINATED}; +use common::{BinarySerializable, VInt}; fn max_score>(mut it: I) -> Option { it.next().map(|first| it.fold(first, Score::max)) @@ -347,7 +347,6 @@ impl BlockSegmentPostings { #[cfg(test)] mod tests { use super::BlockSegmentPostings; - use crate::common::HasLen; use crate::core::Index; use crate::docset::{DocSet, TERMINATED}; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; @@ -358,6 +357,7 @@ mod tests { use crate::schema::Term; use crate::schema::INDEXED; use crate::DocId; + use common::HasLen; #[test] fn test_empty_segment_postings() { diff --git a/src/postings/compression/mod.rs b/src/postings/compression/mod.rs index 138ebcdf1..84a250b65 100644 --- a/src/postings/compression/mod.rs +++ b/src/postings/compression/mod.rs @@ -1,5 +1,5 @@ -use crate::common::FixedSize; use bitpacking::{BitPacker, BitPacker4x}; +use common::FixedSize; 
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN; const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES; diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index a77d5327b..11e8447fd 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -1,10 +1,8 @@ use super::stacker::{ExpUnrolledLinkedList, MemoryArena}; +use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::FieldSerializer; use crate::DocId; -use crate::{ - common::{read_u32_vint, write_u32_vint}, - indexer::doc_id_mapping::DocIdMapping, -}; +use common::{read_u32_vint, write_u32_vint}; const POSITION_END: u32 = 0; diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index eaf36440f..aa470d99f 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,4 +1,3 @@ -use crate::common::HasLen; use crate::docset::DocSet; use crate::fastfield::DeleteBitSet; use crate::positions::PositionReader; @@ -7,6 +6,7 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::BlockSegmentPostings; use crate::postings::Postings; use crate::{DocId, TERMINATED}; +use common::HasLen; /// `SegmentPostings` represents the inverted list or postings associated to /// a term in a `Segment`. 
@@ -265,7 +265,7 @@ impl Postings for SegmentPostings { mod tests { use super::SegmentPostings; - use crate::common::HasLen; + use common::HasLen; use crate::docset::{DocSet, TERMINATED}; use crate::fastfield::DeleteBitSet; diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index eca28a7c8..40e3ca2ac 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,6 +1,4 @@ use super::TermInfo; -use crate::common::CountingWriter; -use crate::common::{BinarySerializable, VInt}; use crate::core::Segment; use crate::directory::CompositeWrite; use crate::directory::WritePtr; @@ -13,6 +11,8 @@ use crate::schema::{Field, FieldEntry, FieldType}; use crate::schema::{IndexRecordOption, Schema}; use crate::termdict::{TermDictionaryBuilder, TermOrdinal}; use crate::{DocId, Score}; +use common::CountingWriter; +use common::{BinarySerializable, VInt}; use std::cmp::Ordering; use std::io::{self, Write}; diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index b86e56b2f..8703b5589 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,4 +1,4 @@ -use crate::common::{BinarySerializable, FixedSize}; +use common::{BinarySerializable, FixedSize}; use std::io; use std::iter::ExactSizeIterator; use std::ops::Range; @@ -67,7 +67,7 @@ impl BinarySerializable for TermInfo { mod tests { use super::TermInfo; - use crate::common::test::fixed_size_test; + use crate::tests::fixed_size_test; // TODO add serialize/deserialize test for terminfo diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 5f9f646ce..ae9bbc45e 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -1,4 +1,3 @@ -use common::BitSet; use crate::core::SegmentReader; use crate::query::ConstScorer; use crate::query::{BitSetDocSet, Explanation}; @@ -7,6 +6,7 @@ use crate::schema::{Field, IndexRecordOption}; use crate::termdict::{TermDictionary, TermStreamer}; use crate::TantivyError; use crate::{DocId, 
Score}; +use common::BitSet; use std::io; use std::sync::Arc; use tantivy_fst::Automaton; diff --git a/src/query/vec_docset.rs b/src/query/vec_docset.rs index 89f32bd7f..3f765ef58 100644 --- a/src/query/vec_docset.rs +++ b/src/query/vec_docset.rs @@ -1,8 +1,8 @@ #![allow(dead_code)] -use crate::common::HasLen; use crate::docset::{DocSet, TERMINATED}; use crate::DocId; +use common::HasLen; /// Simulate a `Postings` objects from a `VecPostings`. /// `VecPostings` only exist for testing purposes. diff --git a/src/schema/document.rs b/src/schema/document.rs index 1887821f2..dc9fe4ba9 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,8 +1,8 @@ use super::*; -use crate::common::BinarySerializable; -use crate::common::VInt; use crate::tokenizer::PreTokenizedString; use crate::DateTime; +use common::BinarySerializable; +use common::VInt; use std::io::{self, Read, Write}; use std::mem; diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 37dbec983..8cbb3b020 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -1,4 +1,4 @@ -use crate::common::BinarySerializable; +use common::BinarySerializable; use once_cell::sync::Lazy; use regex::Regex; use serde::{Deserialize, Deserializer, Serialize, Serializer}; diff --git a/src/schema/field.rs b/src/schema/field.rs index 13ec3d131..ee8348e3c 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -1,4 +1,4 @@ -use crate::common::BinarySerializable; +use common::BinarySerializable; use std::io; use std::io::Read; use std::io::Write; diff --git a/src/schema/field_value.rs b/src/schema/field_value.rs index 1d1b7ec7f..4dbe15afb 100644 --- a/src/schema/field_value.rs +++ b/src/schema/field_value.rs @@ -1,6 +1,6 @@ -use crate::common::BinarySerializable; use crate::schema::Field; use crate::schema::Value; +use common::BinarySerializable; use std::io::{self, Read, Write}; /// `FieldValue` holds together a `Field` and its `Value`. 
diff --git a/src/schema/term.rs b/src/schema/term.rs index 0662e5230..149eab9ee 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,9 +1,9 @@ use std::fmt; use super::Field; -use crate::common; use crate::schema::Facet; use crate::DateTime; +use common; use std::str; /// Size (in bytes) of the buffer of a int field. diff --git a/src/schema/value.rs b/src/schema/value.rs index f34b1fb82..b3b49a8eb 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -276,10 +276,10 @@ impl From for Value { mod binary_serialize { use super::Value; - use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable}; use crate::schema::Facet; use crate::tokenizer::PreTokenizedString; use chrono::{TimeZone, Utc}; + use common::{f64_to_u64, u64_to_f64, BinarySerializable}; use std::io::{self, Read, Write}; const TEXT_CODE: u8 = 0; diff --git a/src/store/footer.rs b/src/store/footer.rs index 1c5f2817b..6f63f8170 100644 --- a/src/store/footer.rs +++ b/src/store/footer.rs @@ -1,8 +1,5 @@ -use crate::{ - common::{BinarySerializable, FixedSize, HasLen}, - directory::FileSlice, - store::Compressor, -}; +use crate::{directory::FileSlice, store::Compressor}; +use common::{BinarySerializable, FixedSize, HasLen}; use std::io; #[derive(Debug, Clone, PartialEq)] diff --git a/src/store/index/block.rs b/src/store/index/block.rs index 3b49905b5..5915f1e13 100644 --- a/src/store/index/block.rs +++ b/src/store/index/block.rs @@ -1,6 +1,6 @@ -use crate::common::VInt; use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; use crate::DocId; +use common::VInt; use std::io; use std::ops::Range; diff --git a/src/store/index/skip_index.rs b/src/store/index/skip_index.rs index 306eb7ca1..b69df319a 100644 --- a/src/store/index/skip_index.rs +++ b/src/store/index/skip_index.rs @@ -1,8 +1,8 @@ -use crate::common::{BinarySerializable, VInt}; use crate::directory::OwnedBytes; use crate::store::index::block::CheckpointBlock; use crate::store::index::Checkpoint; use crate::DocId; +use 
common::{BinarySerializable, VInt}; pub struct LayerCursor<'a> { remaining: &'a [u8], diff --git a/src/store/index/skip_index_builder.rs b/src/store/index/skip_index_builder.rs index 416f7bfa0..c9e311b92 100644 --- a/src/store/index/skip_index_builder.rs +++ b/src/store/index/skip_index_builder.rs @@ -1,6 +1,6 @@ -use crate::common::{BinarySerializable, VInt}; use crate::store::index::block::CheckpointBlock; use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; +use common::{BinarySerializable, VInt}; use std::io; use std::io::Write; diff --git a/src/store/reader.rs b/src/store/reader.rs index 64cef7339..3ff04f691 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -5,11 +5,8 @@ use crate::schema::Document; use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; -use crate::{ - common::{BinarySerializable, HasLen, VInt}, - error::DataCorruption, - fastfield::DeleteBitSet, -}; +use crate::{error::DataCorruption, fastfield::DeleteBitSet}; +use common::{BinarySerializable, HasLen, VInt}; use lru::LruCache; use std::io; use std::sync::atomic::{AtomicUsize, Ordering}; diff --git a/src/store/writer.rs b/src/store/writer.rs index d208920ca..d7004c0f6 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -1,13 +1,13 @@ use super::index::SkipIndexBuilder; use super::StoreReader; use super::{compressors::Compressor, footer::DocStoreFooter}; -use crate::common::CountingWriter; -use crate::common::{BinarySerializable, VInt}; use crate::directory::TerminatingWrite; use crate::directory::WritePtr; use crate::schema::Document; use crate::store::index::Checkpoint; use crate::DocId; +use common::CountingWriter; +use common::{BinarySerializable, VInt}; use std::io::{self, Write}; const BLOCK_SIZE: usize = 16_384; diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index e78d7f2cd..28d463226 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ 
b/src/termdict/fst_termdict/term_info_store.rs @@ -1,8 +1,8 @@ -use crate::common::{BinarySerializable, FixedSize}; use crate::directory::{FileSlice, OwnedBytes}; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; use byteorder::{ByteOrder, LittleEndian}; +use common::{BinarySerializable, FixedSize}; use std::cmp; use std::io::{self, Read, Write}; use tantivy_bitpacker::compute_num_bits; @@ -290,16 +290,16 @@ mod tests { use super::extract_bits; use super::TermInfoBlockMeta; use super::{TermInfoStore, TermInfoStoreWriter}; - use crate::common; - use crate::common::BinarySerializable; use crate::directory::FileSlice; use crate::postings::TermInfo; + use common; + use common::BinarySerializable; use tantivy_bitpacker::compute_num_bits; use tantivy_bitpacker::BitPacker; #[test] fn test_term_info_block() { - common::test::fixed_size_test::(); + crate::tests::fixed_size_test::(); } #[test] diff --git a/src/termdict/fst_termdict/termdict.rs b/src/termdict/fst_termdict/termdict.rs index ff0d4ec5f..078e12054 100644 --- a/src/termdict/fst_termdict/termdict.rs +++ b/src/termdict/fst_termdict/termdict.rs @@ -1,10 +1,10 @@ use super::term_info_store::{TermInfoStore, TermInfoStoreWriter}; use super::{TermStreamer, TermStreamerBuilder}; -use crate::common::{BinarySerializable, CountingWriter}; use crate::directory::{FileSlice, OwnedBytes}; use crate::error::DataCorruption; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; +use common::{BinarySerializable, CountingWriter}; use once_cell::sync::Lazy; use std::io::{self, Write}; use tantivy_fst::raw::Fst;