diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index c42466646..1f4eec67e 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -4,7 +4,7 @@ extern crate test; #[cfg(test)] mod tests { - use std::iter; + use std::ops::RangeInclusive; use std::sync::Arc; use common::OwnedBytes; @@ -71,27 +71,24 @@ mod tests { }); } - fn get_exp_data() -> Vec { + const FIFTY_PERCENT_RANGE: RangeInclusive = 1..=50; + const SINGLE_ITEM: u64 = 90; + const SINGLE_ITEM_RANGE: RangeInclusive = 90..=90; + const ONE_PERCENT_ITEM_RANGE: RangeInclusive = 49..=49; + fn get_data_50percent_item() -> Vec { + let mut rng = StdRng::from_seed([1u8; 32]); + let mut data = vec![]; - for i in 0..100 { - let num = i * i; - data.extend(iter::repeat(i as u64).take(num)); + for _ in 0..300_000 { + let val = rng.gen_range(1..=100); + data.push(val); } - data.shuffle(&mut StdRng::from_seed([1u8; 32])); + data.push(SINGLE_ITEM); - // lengt = 328350 + data.shuffle(&mut rng); + let data = data.iter().map(|el| *el as u128).collect::>(); data } - - fn get_data_50percent_item() -> (u128, u128, Vec) { - let mut permutation = get_exp_data(); - let major_item = 20; - let minor_item = 10; - permutation.extend(iter::repeat(major_item).take(permutation.len())); - permutation.shuffle(&mut StdRng::from_seed([1u8; 32])); - let permutation = permutation.iter().map(|el| *el as u128).collect::>(); - (major_item as u128, minor_item as u128, permutation) - } fn get_u128_column_random() -> Arc> { let permutation = generate_random(); let permutation = permutation.iter().map(|el| *el as u128).collect::>(); @@ -106,15 +103,82 @@ mod tests { open_u128::(out).unwrap() } + // U64 RANGE START + #[bench] + fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) { + let data = get_data_50percent_item(); + let data = data.iter().map(|el| *el as u64).collect::>(); + let column: Arc> = serialize_and_load(&data); + + b.iter(|| { + let mut positions = Vec::new(); 
+ column.get_docids_for_value_range( + FIFTY_PERCENT_RANGE, + 0..data.len() as u32, + &mut positions, + ); + positions + }); + } + + #[bench] + fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) { + let data = get_data_50percent_item(); + let data = data.iter().map(|el| *el as u64).collect::>(); + let column: Arc> = serialize_and_load(&data); + + b.iter(|| { + let mut positions = Vec::new(); + column.get_docids_for_value_range( + ONE_PERCENT_ITEM_RANGE, + 0..data.len() as u32, + &mut positions, + ); + positions + }); + } + + #[bench] + fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) { + let data = get_data_50percent_item(); + let data = data.iter().map(|el| *el as u64).collect::>(); + let column: Arc> = serialize_and_load(&data); + + b.iter(|| { + let mut positions = Vec::new(); + column.get_docids_for_value_range( + SINGLE_ITEM_RANGE, + 0..data.len() as u32, + &mut positions, + ); + positions + }); + } + + #[bench] + fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) { + let data = get_data_50percent_item(); + let data = data.iter().map(|el| *el as u64).collect::>(); + let column: Arc> = serialize_and_load(&data); + + b.iter(|| { + let mut positions = Vec::new(); + column.get_docids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions); + positions + }); + } + // U64 RANGE END + + // U128 RANGE START #[bench] fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) { - let (major_item, _minor_item, data) = get_data_50percent_item(); + let data = get_data_50percent_item(); let column = get_u128_column_from_data(&data); b.iter(|| { let mut positions = Vec::new(); column.get_docids_for_value_range( - major_item..=major_item, + *FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128, 0..data.len() as u32, &mut positions, ); @@ -124,13 +188,13 @@ mod tests { #[bench] fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) { - let (_major_item, minor_item, data) = 
get_data_50percent_item(); + let data = get_data_50percent_item(); let column = get_u128_column_from_data(&data); b.iter(|| { let mut positions = Vec::new(); column.get_docids_for_value_range( - minor_item..=minor_item, + *SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128, 0..data.len() as u32, &mut positions, ); @@ -140,7 +204,7 @@ mod tests { #[bench] fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) { - let (_major_item, _minor_item, data) = get_data_50percent_item(); + let data = get_data_50percent_item(); let column = get_u128_column_from_data(&data); b.iter(|| { @@ -149,6 +213,7 @@ mod tests { positions }); } + // U128 RANGE END #[bench] fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) { diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index 57484ca82..0e68476cc 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -122,8 +122,78 @@ impl MultiValuedFastFieldReader { #[cfg(test)] mod tests { + use time::{Duration, OffsetDateTime}; + + use crate::collector::Count; use crate::core::Index; + use crate::query::RangeQuery; use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema}; + use crate::{DateOptions, DateTime}; + + #[test] + fn test_multivalued_date_docids_for_value_range() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let date_field = schema_builder.add_date_field( + "multi_date_field", + DateOptions::default() + .set_fast(Cardinality::MultiValues) + .set_indexed() + .set_fieldnorm() + .set_stored(), + ); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + let first_time_stamp = OffsetDateTime::now_utc(); + index_writer.add_document(doc!( + date_field => DateTime::from_utc(first_time_stamp), + date_field => DateTime::from_utc(first_time_stamp), + ))?; + index_writer.add_document(doc!())?; + // add one second + 
index_writer.add_document(doc!( + date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)), + ))?; + // add another second + let two_secs_ahead = first_time_stamp + Duration::seconds(2); + index_writer.add_document(doc!( + date_field => DateTime::from_utc(two_secs_ahead), + date_field => DateTime::from_utc(two_secs_ahead), + date_field => DateTime::from_utc(two_secs_ahead), + ))?; + // add three seconds + index_writer.add_document(doc!( + date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)), + ))?; + index_writer.commit()?; + + let reader = index.reader()?; + let searcher = reader.searcher(); + let reader = searcher.segment_reader(0); + assert_eq!(reader.num_docs(), 5); + + let date_ff_reader = reader.fast_fields().dates(date_field).unwrap(); + let mut docids = vec![]; + date_ff_reader.get_docids_for_value_range( + DateTime::from_utc(first_time_stamp)..=DateTime::from_utc(two_secs_ahead), + 0..5, + &mut docids, + ); + assert_eq!(docids, vec![0, 2, 3]); + + let count_multiples = + |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); + + assert_eq!( + count_multiples(RangeQuery::new_date( + date_field, + DateTime::from_utc(first_time_stamp)..DateTime::from_utc(two_secs_ahead) + )), + 2 + ); + + Ok(()) + } #[test] fn test_multifastfield_reader() -> crate::Result<()> { diff --git a/src/query/range_query/mod.rs b/src/query/range_query/mod.rs index a1f07f498..086d037b0 100644 --- a/src/query/range_query/mod.rs +++ b/src/query/range_query/mod.rs @@ -1,5 +1,6 @@ mod fast_field_range_query; mod range_query; mod range_query_ip_fastfield; +mod range_query_u64_fastfield; pub use self::range_query::RangeQuery; diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index 6ce2dbd70..8567859c0 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -1,8 +1,9 @@ use std::io; use std::ops::{Bound, Range}; -use common::BitSet; +use 
common::{BinarySerializable, BitSet}; +use super::range_query_u64_fastfield::FastFieldRangeWeight; use crate::core::SegmentReader; use crate::error::TantivyError; use crate::query::explanation::does_not_match; @@ -10,7 +11,7 @@ use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight; use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight}; use crate::schema::{Field, IndexRecordOption, Term, Type}; use crate::termdict::{TermDictionary, TermStreamer}; -use crate::{DocId, Score}; +use crate::{DateTime, DocId, Score}; pub(crate) fn map_bound TTo>( bound: &Bound, @@ -203,6 +204,40 @@ impl RangeQuery { ) } + /// Create a new `RangeQuery` over a `date` field. + /// + /// The two `Bound` arguments make it possible to create more complex + /// ranges than semi-inclusive range. + /// + /// If the field is not of the type `date`, tantivy + /// will panic when the `Weight` object is created. + pub fn new_date_bounds( + field: Field, + left_bound: Bound, + right_bound: Bound, + ) -> RangeQuery { + let make_term_val = + |val: &DateTime| Term::from_field_date(field, *val).value_bytes().to_owned(); + RangeQuery { + field, + value_type: Type::Date, + left_bound: map_bound(&left_bound, &make_term_val), + right_bound: map_bound(&right_bound, &make_term_val), + } + } + + /// Create a new `RangeQuery` over a `date` field. + /// + /// If the field is not of the type `date`, tantivy + /// will panic when the `Weight` object is created. + pub fn new_date(field: Field, range: Range) -> RangeQuery { + RangeQuery::new_date_bounds( + field, + Bound::Included(range.start), + Bound::Excluded(range.end), + ) + } + /// Create a new `RangeQuery` over a `Str` field. 
/// /// The two `Bound` arguments make it possible to create more complex @@ -252,6 +287,23 @@ impl RangeQuery { } } +fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool { + match typ { + Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true, + Type::IpAddr => true, + Type::Str | Type::Facet | Type::Bytes | Type::Json => false, + } +} + +/// Returns true if the type maps to a u64 fast field +pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool { + match typ { + Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true, + Type::IpAddr => false, + Type::Str | Type::Facet | Type::Bytes | Type::Json => false, + } +} + impl Query for RangeQuery { fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result> { let schema = enable_scoring.schema(); @@ -265,12 +317,29 @@ impl Query for RangeQuery { return Err(TantivyError::SchemaError(err_msg)); } - if field_type.is_ip_addr() && field_type.is_fast() { - Ok(Box::new(IPFastFieldRangeWeight::new( - self.field, - &self.left_bound, - &self.right_bound, - ))) + if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) { + if field_type.is_ip_addr() { + Ok(Box::new(IPFastFieldRangeWeight::new( + self.field, + &self.left_bound, + &self.right_bound, + ))) + } else { + // We run the range query on u64 value space for performance reasons and simplicity + // assert the type maps to u64 + assert!(maps_to_u64_fastfield(self.value_type)); + let parse_from_bytes = |data: &Vec| { + u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap()) + }; + + let left_bound = map_bound(&self.left_bound, &parse_from_bytes); + let right_bound = map_bound(&self.right_bound, &parse_from_bytes); + Ok(Box::new(FastFieldRangeWeight::new( + self.field, + left_bound, + right_bound, + ))) + } } else { Ok(Box::new(RangeWeight { field: self.field, diff --git a/src/query/range_query/range_query_ip_fastfield.rs b/src/query/range_query/range_query_ip_fastfield.rs index 
90e22c858..b97d1bb2f 100644 --- a/src/query/range_query/range_query_ip_fastfield.rs +++ b/src/query/range_query/range_query_ip_fastfield.rs @@ -23,13 +23,13 @@ pub struct IPFastFieldRangeWeight { impl IPFastFieldRangeWeight { pub fn new(field: Field, left_bound: &Bound>, right_bound: &Bound>) -> Self { - let ip_from_bound_raw_data = |data: &Vec| { - let left_ip_u128: u128 = + let parse_ip_from_bytes = |data: &Vec| { + let ip_u128: u128 = u128::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap()); - Ipv6Addr::from_u128(left_ip_u128) + Ipv6Addr::from_u128(ip_u128) }; - let left_bound = map_bound(left_bound, &ip_from_bound_raw_data); - let right_bound = map_bound(right_bound, &ip_from_bound_raw_data); + let left_bound = map_bound(left_bound, &parse_ip_from_bytes); + let right_bound = map_bound(right_bound, &parse_ip_from_bytes); Self { field, left_bound, diff --git a/src/query/range_query/range_query_u64_fastfield.rs b/src/query/range_query/range_query_u64_fastfield.rs new file mode 100644 index 000000000..f9e7aac9d --- /dev/null +++ b/src/query/range_query/range_query_u64_fastfield.rs @@ -0,0 +1,558 @@ +//! Fastfields support efficient scanning for range queries. +//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is +//! used, which uses the term dictionary + postings. + +use std::ops::{Bound, RangeInclusive}; + +use fastfield_codecs::MonotonicallyMappableToU64; + +use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet}; +use super::range_query::map_bound; +use crate::query::{ConstScorer, Explanation, Scorer, Weight}; +use crate::schema::{Cardinality, Field}; +use crate::{DocId, DocSet, Score, SegmentReader, TantivyError}; + +/// `FastFieldRangeWeight` uses the fast field to execute range queries. 
+pub struct FastFieldRangeWeight { + field: Field, + left_bound: Bound, + right_bound: Bound, +} + +impl FastFieldRangeWeight { + pub fn new(field: Field, left_bound: Bound, right_bound: Bound) -> Self { + let left_bound = map_bound(&left_bound, &|val| *val); + let right_bound = map_bound(&right_bound, &|val| *val); + Self { + field, + left_bound, + right_bound, + } + } +} + +impl Weight for FastFieldRangeWeight { + fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + let field_type = reader.schema().get_field_entry(self.field).field_type(); + match field_type.fastfield_cardinality().unwrap() { + Cardinality::SingleValue => { + let fast_field = reader.fast_fields().u64_lenient(self.field)?; + let value_range = bound_to_value_range( + &self.left_bound, + &self.right_bound, + fast_field.min_value(), + fast_field.max_value(), + ); + let docset = + RangeDocSet::new(value_range, FastFieldCardinality::SingleValue(fast_field)); + Ok(Box::new(ConstScorer::new(docset, boost))) + } + Cardinality::MultiValues => { + let fast_field = reader.fast_fields().u64s_lenient(self.field)?; + let value_range = bound_to_value_range( + &self.left_bound, + &self.right_bound, + fast_field.min_value(), + fast_field.max_value(), + ); + let docset = + RangeDocSet::new(value_range, FastFieldCardinality::MultiValue(fast_field)); + Ok(Box::new(ConstScorer::new(docset, boost))) + } + } + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { + let mut scorer = self.scorer(reader, 1.0)?; + if scorer.seek(doc) != doc { + return Err(TantivyError::InvalidArgument(format!( + "Document #({}) does not match", + doc + ))); + } + let explanation = Explanation::new("Const", scorer.score()); + + Ok(explanation) + } +} + +fn bound_to_value_range( + left_bound: &Bound, + right_bound: &Bound, + min_value: T, + max_value: T, +) -> RangeInclusive { + let start_value = match left_bound { + Bound::Included(val) => *val, + Bound::Excluded(val) => 
T::from_u64(val.to_u64() + 1), + Bound::Unbounded => min_value, + }; + + let end_value = match right_bound { + Bound::Included(val) => *val, + Bound::Excluded(val) => T::from_u64(val.to_u64() - 1), + Bound::Unbounded => max_value, + }; + start_value..=end_value +} + +#[cfg(test)] +mod tests { + use proptest::prelude::ProptestConfig; + use proptest::strategy::Strategy; + use proptest::{prop_oneof, proptest}; + use rand::rngs::StdRng; + use rand::seq::SliceRandom; + use rand::SeedableRng; + + use super::*; + use crate::collector::Count; + use crate::query::QueryParser; + use crate::schema::{NumericOptions, Schema, FAST, INDEXED, STORED, STRING}; + use crate::Index; + + #[derive(Clone, Debug)] + pub struct Doc { + pub id_name: String, + pub id: u64, + } + + fn operation_strategy() -> impl Strategy { + prop_oneof![ + (0u64..10_000u64).prop_map(doc_from_id_1), + (1u64..10_000u64).prop_map(doc_from_id_2), + ] + } + + pub fn doc_from_id_1(id: u64) -> Doc { + let id = id * 1000; + Doc { + id_name: id.to_string(), + id, + } + } + fn doc_from_id_2(id: u64) -> Doc { + let id = id * 1000; + Doc { + id_name: (id - 1).to_string(), + id, + } + } + + proptest! 
{ + #![proptest_config(ProptestConfig::with_cases(10))] + #[test] + fn test_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) { + assert!(test_id_range_for_docs(ops).is_ok()); + } + } + + #[test] + fn range_regression1_test() { + let ops = vec![doc_from_id_1(0)]; + assert!(test_id_range_for_docs(ops).is_ok()); + } + + #[test] + fn range_regression2_test() { + let ops = vec![ + doc_from_id_1(52), + doc_from_id_1(63), + doc_from_id_1(12), + doc_from_id_2(91), + doc_from_id_2(33), + ]; + assert!(test_id_range_for_docs(ops).is_ok()); + } + + #[test] + fn range_regression3_test() { + let ops = vec![doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)]; + assert!(test_id_range_for_docs(ops).is_ok()); + } + + #[test] + fn range_regression4_test() { + let ops = vec![doc_from_id_2(100)]; + assert!(test_id_range_for_docs(ops).is_ok()); + } + + pub fn create_index_from_docs(docs: &[Doc]) -> Index { + let mut schema_builder = Schema::builder(); + let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST); + let ids_u64_field = schema_builder.add_u64_field( + "ids", + NumericOptions::default() + .set_fast(Cardinality::MultiValues) + .set_indexed(), + ); + + let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST); + let ids_f64_field = schema_builder.add_f64_field( + "ids_f64", + NumericOptions::default() + .set_fast(Cardinality::MultiValues) + .set_indexed(), + ); + + let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST); + let ids_i64_field = schema_builder.add_i64_field( + "ids_i64", + NumericOptions::default() + .set_fast(Cardinality::MultiValues) + .set_indexed(), + ); + + let text_field = schema_builder.add_text_field("id_name", STRING | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + + { + let mut index_writer = index.writer(3_000_000).unwrap(); + for doc in docs.iter() { + index_writer + .add_document(doc!( + 
ids_i64_field => doc.id as i64, + ids_i64_field => doc.id as i64, + ids_f64_field => doc.id as f64, + ids_f64_field => doc.id as f64, + ids_u64_field => doc.id, + ids_u64_field => doc.id, + id_u64_field => doc.id, + id_f64_field => doc.id as f64, + id_i64_field => doc.id as i64, + text_field => doc.id_name.to_string(), + )) + .unwrap(); + } + + index_writer.commit().unwrap(); + } + index + } + + fn test_id_range_for_docs(docs: Vec) -> crate::Result<()> { + let index = create_index_from_docs(&docs); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let mut rng: StdRng = StdRng::from_seed([1u8; 32]); + + let get_num_hits = |query| searcher.search(&query, &(Count)).unwrap(); + let query_from_text = |text: &str| { + QueryParser::for_index(&index, vec![]) + .parse_query(text) + .unwrap() + }; + + let gen_query_inclusive = |field: &str, from: u64, to: u64| { + format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string()) + }; + + let test_sample = |sample_docs: Vec| { + let mut ids: Vec = sample_docs.iter().map(|doc| doc.id).collect(); + ids.sort(); + let expected_num_hits = docs + .iter() + .filter(|doc| (ids[0]..=ids[1]).contains(&doc.id)) + .count(); + + let query = gen_query_inclusive("id", ids[0], ids[1]); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + + let query = gen_query_inclusive("ids", ids[0], ids[1]); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + + // Intersection search + let id_filter = sample_docs[0].id_name.to_string(); + let expected_num_hits = docs + .iter() + .filter(|doc| (ids[0]..=ids[1]).contains(&doc.id) && doc.id_name == id_filter) + .count(); + let query = format!( + "{} AND id_name:{}", + gen_query_inclusive("id", ids[0], ids[1]), + &id_filter + ); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + let query = format!( + "{} AND id_name:{}", + gen_query_inclusive("id_f64", ids[0], ids[1]), + &id_filter + ); + 
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + let query = format!( + "{} AND id_name:{}", + gen_query_inclusive("id_i64", ids[0], ids[1]), + &id_filter + ); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + + // Intersection search on multivalue id field + let id_filter = sample_docs[0].id_name.to_string(); + let query = format!( + "{} AND id_name:{}", + gen_query_inclusive("ids", ids[0], ids[1]), + &id_filter + ); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + let query = format!( + "{} AND id_name:{}", + gen_query_inclusive("ids_f64", ids[0], ids[1]), + &id_filter + ); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + let query = format!( + "{} AND id_name:{}", + gen_query_inclusive("ids_i64", ids[0], ids[1]), + &id_filter + ); + assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); + }; + + test_sample(vec![docs[0].clone(), docs[0].clone()]); + + let samples: Vec<_> = docs.choose_multiple(&mut rng, 3).collect(); + + if samples.len() > 1 { + test_sample(vec![samples[0].clone(), samples[1].clone()]); + test_sample(vec![samples[1].clone(), samples[1].clone()]); + } + if samples.len() > 2 { + test_sample(vec![samples[1].clone(), samples[2].clone()]); + } + + Ok(()) + } +} + +#[cfg(all(test, feature = "unstable"))] +mod bench { + + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + use test::Bencher; + + use super::tests::*; + use super::*; + use crate::collector::Count; + use crate::query::QueryParser; + use crate::Index; + + fn get_index_0_to_100() -> Index { + let mut rng = StdRng::from_seed([1u8; 32]); + let num_vals = 100_000; + let docs: Vec<_> = (0..num_vals) + .map(|_i| { + let id_name = if rng.gen_bool(0.01) { + "veryfew".to_string() // 1% + } else if rng.gen_bool(0.1) { + "few".to_string() // 9% + } else { + "many".to_string() // 90% + }; + Doc { + id_name, + id: rng.gen_range(0..100), + } + }) + .collect(); + + let index = 
create_index_from_docs(&docs); + index + } + + fn get_90_percent() -> RangeInclusive { + 0..=90 + } + + fn get_10_percent() -> RangeInclusive { + 0..=10 + } + + fn get_1_percent() -> RangeInclusive { + 10..=10 + } + + fn excute_query( + field: &str, + id_range: RangeInclusive, + suffix: &str, + index: &Index, + ) -> usize { + let gen_query_inclusive = |from: &u64, to: &u64| { + format!( + "{}:[{} TO {}] {}", + field, + &from.to_string(), + &to.to_string(), + suffix + ) + }; + + let query = gen_query_inclusive(id_range.start(), id_range.end()); + let query_from_text = |text: &str| { + QueryParser::for_index(&index, vec![]) + .parse_query(text) + .unwrap() + }; + let query = query_from_text(&query); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + searcher.search(&query, &(Count)).unwrap() + } + + #[bench] + fn bench_id_range_hit_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_90_percent(), "", &index)); + } + + #[bench] + fn bench_id_range_hit_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_10_percent(), "", &index)); + } + + #[bench] + fn bench_id_range_hit_1_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_1_percent(), "", &index)); + } + + #[bench] + fn bench_id_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_10_percent(), "AND id_name:few", &index)); + } + + #[bench] + fn bench_id_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:few", &index)); + } + + #[bench] + fn bench_id_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_1_percent(), 
"AND id_name:many", &index)); + } + + #[bench] + fn bench_id_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:veryfew", &index)); + } + + #[bench] + fn bench_id_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_10_percent(), "AND id_name:many", &index)); + } + + #[bench] + fn bench_id_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:many", &index)); + } + + #[bench] + fn bench_id_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:few", &index)); + } + + #[bench] + fn bench_id_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:veryfew", &index)); + } + + #[bench] + fn bench_id_range_hit_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_90_percent(), "", &index)); + } + + #[bench] + fn bench_id_range_hit_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_10_percent(), "", &index)); + } + + #[bench] + fn bench_id_range_hit_1_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_1_percent(), "", &index)); + } + + #[bench] + fn bench_id_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_10_percent(), "AND id_name:few", &index)); + } + + #[bench] + fn 
bench_id_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:few", &index)); + } + + #[bench] + fn bench_id_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:many", &index)); + } + + #[bench] + fn bench_id_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:veryfew", &index)); + } + + #[bench] + fn bench_id_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_10_percent(), "AND id_name:many", &index)); + } + + #[bench] + fn bench_id_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:many", &index)); + } + + #[bench] + fn bench_id_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:few", &index)); + } + + #[bench] + fn bench_id_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { + let index = get_index_0_to_100(); + + bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:veryfew", &index)); + } +} diff --git a/src/termdict/fst_termdict/merger.rs b/src/termdict/fst_termdict/merger.rs index e9241f916..83361a908 100644 --- a/src/termdict/fst_termdict/merger.rs +++ b/src/termdict/fst_termdict/merger.rs @@ -113,7 +113,7 @@ mod bench { } /// Create a dictionary of random strings. 
- fn rand_dict(num_terms: usize) -> crate::Result { + fn rand_dict(num_terms: usize) -> std::io::Result { let buffer: Vec = { let mut terms = vec![]; for _i in 0..num_terms {