diff --git a/common/Cargo.toml b/common/Cargo.toml index d262da05e..b9f09b950 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -10,3 +10,6 @@ description = "common traits and utility functions used by multiple tantivy subc [dependencies] byteorder = "1.4.3" + +[dev-dependencies] +rand = "0.8.4" diff --git a/src/common/bitset.rs b/common/src/bitset.rs similarity index 89% rename from src/common/bitset.rs rename to common/src/bitset.rs index 0eb4d8da6..942a94269 100644 --- a/src/common/bitset.rs +++ b/common/src/bitset.rs @@ -2,7 +2,7 @@ use std::fmt; use std::u64; #[derive(Clone, Copy, Eq, PartialEq)] -pub(crate) struct TinySet(u64); +pub struct TinySet(u64); impl fmt::Debug for TinySet { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -178,7 +178,7 @@ impl BitSet { /// /// Reminder: the tiny set with the bucket `bucket`, represents the /// elements from `bucket * 64` to `(bucket+1) * 64`. - pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option { + pub fn first_non_empty_bucket(&self, bucket: u32) -> Option { self.tinysets[bucket as usize..] .iter() .cloned() @@ -193,7 +193,7 @@ impl BitSet { /// Returns the tiny bitset representing the /// the set restricted to the number range from /// `bucket * 64` to `(bucket + 1) * 64`. - pub(crate) fn tinyset(&self, bucket: u32) -> TinySet { + pub fn tinyset(&self, bucket: u32) -> TinySet { self.tinysets[bucket as usize] } } @@ -203,11 +203,9 @@ mod tests { use super::BitSet; use super::TinySet; - use crate::docset::{DocSet, TERMINATED}; - use crate::query::BitSetDocSet; - use crate::tests; - use crate::tests::generate_nonunique_unsorted; - use std::collections::BTreeSet; + use rand::distributions::Bernoulli; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use std::collections::HashSet; #[test] @@ -263,29 +261,6 @@ mod tests { test_against_hashset(&[62u32, 63u32], 64); } - #[test] - fn test_bitset_large() { - let arr = generate_nonunique_unsorted(100_000, 5_000); - let mut btreeset: BTreeSet = BTreeSet::new(); - let mut bitset = BitSet::with_max_value(100_000); - for el in arr { - btreeset.insert(el); - bitset.insert(el); - } - for i in 0..100_000 { - assert_eq!(btreeset.contains(&i), bitset.contains(i)); - } - assert_eq!(btreeset.len(), bitset.len()); - let mut bitset_docset = BitSetDocSet::from(bitset); - let mut remaining = true; - for el in btreeset.into_iter() { - assert!(remaining); - assert_eq!(bitset_docset.doc(), el); - remaining = bitset_docset.advance() != TERMINATED; - } - assert!(!remaining); - } - #[test] fn test_bitset_num_buckets() { use super::num_buckets; @@ -340,10 +315,23 @@ mod tests { assert_eq!(bitset.len(), 3); } + pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec { + StdRng::from_seed([seed_val; 32]) + .sample_iter(&Bernoulli::new(ratio).unwrap()) + .take(n as usize) + .enumerate() + .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None }) + .collect() + } + + pub fn sample(n: u32, ratio: f64) -> Vec { + sample_with_seed(n, ratio, 4) + } + #[test] fn test_bitset_clear() { let mut bitset = BitSet::with_max_value(1_000); - let els = tests::sample(1_000, 0.01f64); + let els = sample(1_000, 0.01f64); for &el in &els { bitset.insert(el); } diff --git a/common/src/lib.rs b/common/src/lib.rs index b3c24163b..46b2e0222 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -1,9 +1,11 @@ pub use byteorder::LittleEndian as Endianness; +mod bitset; mod serialize; mod vint; mod writer; +pub use bitset::*; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt}; pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite}; diff --git a/src/common/mod.rs b/src/common/mod.rs index b82b352f8..cb1f6949b 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -1,9 +1,3 @@ -mod bitset; -mod composite_file; - -pub use self::bitset::BitSet; -pub(crate) use self::bitset::TinySet; -pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; pub use byteorder::LittleEndian as Endianness; pub use common::CountingWriter; pub use common::{ diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 27110ce31..73de5fb4c 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -2,7 +2,9 @@ use crate::core::InvertedIndexReader; use crate::core::Segment; use crate::core::SegmentComponent; use crate::core::SegmentId; +use crate::directory::CompositeFile; use crate::directory::FileSlice; +use crate::error::DataCorruption; use crate::fastfield::DeleteBitSet; use crate::fastfield::FacetReader; use crate::fastfield::FastFieldReaders; @@ -14,7 +16,6 @@ use crate::space_usage::SegmentSpaceUsage; use crate::store::StoreReader; use crate::termdict::TermDictionary; use crate::DocId; -use crate::{common::CompositeFile, error::DataCorruption}; use fail::fail_point; use std::fmt; use std::sync::Arc; diff --git a/src/common/composite_file.rs b/src/directory/composite_file.rs similarity index 99% rename from src/common/composite_file.rs rename to src/directory/composite_file.rs index babee87d1..5619f803c 100644 --- a/src/common/composite_file.rs +++ b/src/directory/composite_file.rs @@ -1,5 +1,6 @@ use crate::common::BinarySerializable; use crate::common::CountingWriter; +use crate::common::HasLen; use crate::common::VInt; use crate::directory::FileSlice; use crate::directory::{TerminatingWrite, WritePtr}; @@ -11,8 +12,6 @@ use std::io::{self, Read, Write}; use std::iter::ExactSizeIterator; use std::ops::Range; -use super::HasLen; - #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] pub struct FileAddr { field: Field, diff --git a/src/directory/mod.rs b/src/directory/mod.rs index fcfe90342..fcf6fdd35 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -20,6 +20,9 @@ mod watch_event_router; /// Errors specific to the directory module. pub mod error; +mod composite_file; + +pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; pub use self::directory::DirectoryLock; pub use self::directory::{Directory, DirectoryClone}; pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK}; diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index a2f14aa7c..fbf3c33dd 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -1,4 +1,5 @@ -use crate::common::{BitSet, HasLen}; +use crate::common::HasLen; +use common::BitSet; use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::directory::WritePtr; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index bc46e5bec..24cf4872b 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -213,7 +213,7 @@ fn value_to_u64(value: &Value) -> u64 { mod tests { use super::*; - use crate::common::CompositeFile; + use crate::directory::CompositeFile; use crate::common::HasLen; use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::merge_policy::NoMergePolicy; @@ -588,7 +588,7 @@ mod bench { use super::tests::FIELD; use super::tests::{generate_permutation, SCHEMA}; use super::*; - use crate::common::CompositeFile; + use crate::directory::CompositeFile; use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::fastfield::FastFieldReader; use std::collections::HashMap; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 47adcddfb..b5902796e 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,6 +1,6 @@ use super::FastValue; use crate::common::BinarySerializable; -use crate::common::CompositeFile; +use crate::directory::CompositeFile; use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::directory::{Directory, RamDirectory, WritePtr}; diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index f202bbb35..b85754641 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -1,4 +1,4 @@ -use crate::common::CompositeFile; +use crate::directory::CompositeFile; use crate::directory::FileSlice; use crate::fastfield::MultiValuedFastFieldReader; use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError}; diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 2279ce1b5..26a9edbbe 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -1,5 +1,5 @@ use crate::common::BinarySerializable; -use crate::common::CompositeWrite; +use crate::directory::CompositeWrite; use crate::common::CountingWriter; use crate::directory::WritePtr; use crate::schema::Field; diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index 71535e3f9..ea264b2ff 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -1,5 +1,5 @@ use super::{fieldnorm_to_id, id_to_fieldnorm}; -use crate::common::CompositeFile; +use crate::directory::CompositeFile; use crate::directory::FileSlice; use crate::directory::OwnedBytes; use crate::schema::Field; diff --git a/src/fieldnorm/serializer.rs b/src/fieldnorm/serializer.rs index 057626fcc..54043b1e9 100644 --- a/src/fieldnorm/serializer.rs +++ b/src/fieldnorm/serializer.rs @@ -1,4 +1,4 @@ -use crate::common::CompositeWrite; +use crate::directory::CompositeWrite; use crate::directory::WritePtr; use crate::schema::Field; use std::io; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 183741a20..e644a1e61 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1,7 +1,7 @@ use super::operation::{AddOperation, UserOperation}; use super::segment_updater::SegmentUpdater; use super::PreparedCommit; -use crate::common::BitSet; +use common::BitSet; use crate::core::Index; use crate::core::Segment; use crate::core::SegmentComponent; diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index 1808fd1da..4ac352e50 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -1,7 +1,7 @@ -use crate::common::BitSet; use crate::core::SegmentId; use crate::core::SegmentMeta; use crate::indexer::delete_queue::DeleteCursor; +use common::BitSet; use std::fmt; /// A segment entry describes the state of diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 8631586c2..eca28a7c8 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,7 +1,8 @@ use super::TermInfo; +use crate::common::CountingWriter; use crate::common::{BinarySerializable, VInt}; -use crate::common::{CompositeWrite, CountingWriter}; use crate::core::Segment; +use crate::directory::CompositeWrite; use crate::directory::WritePtr; use crate::fieldnorm::FieldNormReader; use crate::positions::PositionSerializer; diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 2ffa4309a..5f9f646ce 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -1,4 +1,4 @@ -use crate::common::BitSet; +use common::BitSet; use crate::core::SegmentReader; use crate::query::ConstScorer; use crate::query::{BitSetDocSet, Explanation}; diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs index b74ccd2e8..030fdeae7 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -1,6 +1,6 @@ -use crate::common::{BitSet, TinySet}; use crate::docset::{DocSet, TERMINATED}; use crate::DocId; +use common::{BitSet, TinySet}; /// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`. /// @@ -96,10 +96,13 @@ impl DocSet for BitSetDocSet { #[cfg(test)] mod tests { + use std::collections::BTreeSet; + use super::BitSetDocSet; - use crate::common::BitSet; use crate::docset::{DocSet, TERMINATED}; + use crate::tests::generate_nonunique_unsorted; use crate::DocId; + use common::BitSet; fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet { let mut docset = BitSet::with_max_value(max_doc); @@ -109,6 +112,29 @@ mod tests { BitSetDocSet::from(docset) } + #[test] + fn test_bitset_large() { + let arr = generate_nonunique_unsorted(100_000, 5_000); + let mut btreeset: BTreeSet = BTreeSet::new(); + let mut bitset = BitSet::with_max_value(100_000); + for el in arr { + btreeset.insert(el); + bitset.insert(el); + } + for i in 0..100_000 { + assert_eq!(btreeset.contains(&i), bitset.contains(i)); + } + assert_eq!(btreeset.len(), bitset.len()); + let mut bitset_docset = BitSetDocSet::from(bitset); + let mut remaining = true; + for el in btreeset.into_iter() { + assert!(remaining); + assert_eq!(bitset_docset.doc(), el); + remaining = bitset_docset.advance() != TERMINATED; + } + assert!(!remaining); + } + #[test] fn test_empty() { let bitset = BitSet::with_max_value(1000); diff --git a/src/query/range_query.rs b/src/query/range_query.rs index fa230d015..a625a3354 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -1,4 +1,3 @@ -use crate::common::BitSet; use crate::core::Searcher; use crate::core::SegmentReader; use crate::error::TantivyError; @@ -10,6 +9,7 @@ use crate::schema::Type; use crate::schema::{Field, IndexRecordOption, Term}; use crate::termdict::{TermDictionary, TermStreamer}; use crate::{DocId, Score}; +use common::BitSet; use std::io; use std::ops::{Bound, Range}; diff --git a/src/query/union.rs b/src/query/union.rs index 8185f7c6c..cf7b4d956 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -1,9 +1,9 @@ -use crate::common::TinySet; use crate::docset::{DocSet, TERMINATED}; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner}; use crate::query::Scorer; use crate::DocId; use crate::Score; +use common::TinySet; const HORIZON_NUM_TINYBITSETS: usize = 64; const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;