mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
refactor fast field query (#2452)
In preparation for #2023 and #1709:
* Use `Term` to pass parameters.
* Merge the u64 and IP fast field range queries.
Side note: `range_query_u64_fastfield` was not renamed, because git would otherwise be unable to track the changes.
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
use std::ops::Bound;
|
||||
|
||||
// # Searching a range on an indexed int field.
|
||||
//
|
||||
// Below is an example of creating an indexed integer field in your schema
|
||||
@@ -5,7 +7,7 @@
|
||||
use tantivy::collector::Count;
|
||||
use tantivy::query::RangeQuery;
|
||||
use tantivy::schema::{Schema, INDEXED};
|
||||
use tantivy::{doc, Index, IndexWriter, Result};
|
||||
use tantivy::{doc, Index, IndexWriter, Result, Term};
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// For the sake of simplicity, this schema will only have 1 field
|
||||
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
// The end is excluded i.e. here we are searching up to 1969
|
||||
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
|
||||
let docs_in_the_sixties = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
);
|
||||
// Uses a Count collector to sum the total number of docs in the range
|
||||
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
assert_eq!(num_60s_books, 10);
|
||||
|
||||
@@ -303,7 +303,7 @@ mod tests_mmap {
|
||||
Type::Str,
|
||||
),
|
||||
(format!("{field_name_out_internal}a"), Type::Str),
|
||||
(format!("{field_name_out_internal}"), Type::Str),
|
||||
(field_name_out_internal.to_string(), Type::Str),
|
||||
(format!("num{field_name_out_internal}"), Type::I64),
|
||||
];
|
||||
expected_fields.sort();
|
||||
|
||||
@@ -22,10 +22,7 @@ pub struct AllWeight;
|
||||
|
||||
impl Weight for AllWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let all_scorer = AllScorer {
|
||||
doc: 0u32,
|
||||
max_doc: reader.max_doc(),
|
||||
};
|
||||
let all_scorer = AllScorer::new(reader.max_doc());
|
||||
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
|
||||
}
|
||||
|
||||
@@ -43,6 +40,13 @@ pub struct AllScorer {
|
||||
max_doc: DocId,
|
||||
}
|
||||
|
||||
impl AllScorer {
|
||||
/// Creates a new AllScorer with `max_doc` docs.
|
||||
pub fn new(max_doc: DocId) -> AllScorer {
|
||||
AllScorer { doc: 0u32, max_doc }
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for AllScorer {
|
||||
#[inline(always)]
|
||||
fn advance(&mut self) -> DocId {
|
||||
|
||||
@@ -192,7 +192,7 @@ mod tests {
|
||||
.cloned()
|
||||
.map(VecDocSet::from)
|
||||
.map(|d| ConstScorer::new(d, 1.0)),
|
||||
DoNothingCombiner::default(),
|
||||
DoNothingCombiner,
|
||||
min_match,
|
||||
)
|
||||
};
|
||||
|
||||
@@ -149,7 +149,7 @@ mod tests {
|
||||
use crate::query::exist_query::ExistsQuery;
|
||||
use crate::query::{BooleanQuery, RangeQuery};
|
||||
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
|
||||
use crate::{Index, Searcher};
|
||||
use crate::{Index, Searcher, Term};
|
||||
|
||||
#[test]
|
||||
fn test_exists_query_simple() -> crate::Result<()> {
|
||||
@@ -188,9 +188,8 @@ mod tests {
|
||||
|
||||
// exercise seek
|
||||
let query = BooleanQuery::intersection(vec![
|
||||
Box::new(RangeQuery::new_u64_bounds(
|
||||
"all".to_string(),
|
||||
Bound::Included(50),
|
||||
Box::new(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(all_field, 50)),
|
||||
Bound::Unbounded,
|
||||
)),
|
||||
Box::new(ExistsQuery::new_exists_query("even".to_string())),
|
||||
@@ -198,10 +197,9 @@ mod tests {
|
||||
assert_eq!(searcher.search(&query, &Count)?, 25);
|
||||
|
||||
let query = BooleanQuery::intersection(vec![
|
||||
Box::new(RangeQuery::new_u64_bounds(
|
||||
"all".to_string(),
|
||||
Bound::Included(0),
|
||||
Bound::Excluded(50),
|
||||
Box::new(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(all_field, 0)),
|
||||
Bound::Included(Term::from_field_u64(all_field, 50)),
|
||||
)),
|
||||
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
|
||||
]);
|
||||
|
||||
@@ -54,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query::{EnableScoring, Query, QueryClone};
|
||||
pub use self::query_parser::{QueryParser, QueryParserError};
|
||||
pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
|
||||
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
|
||||
pub use self::regex_query::RegexQuery;
|
||||
pub use self::reqopt_scorer::RequiredOptionalScorer;
|
||||
pub use self::score_combiner::{
|
||||
|
||||
@@ -145,15 +145,7 @@ impl Query for PhrasePrefixQuery {
|
||||
Bound::Unbounded
|
||||
};
|
||||
|
||||
let mut range_query = RangeQuery::new_term_bounds(
|
||||
enable_scoring
|
||||
.schema()
|
||||
.get_field_name(self.field)
|
||||
.to_owned(),
|
||||
self.prefix.1.typ(),
|
||||
&Bound::Included(self.prefix.1.clone()),
|
||||
&end_term,
|
||||
);
|
||||
let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
|
||||
range_query.limit(self.max_expansions as u64);
|
||||
range_query.weight(enable_scoring)
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::fmt;
|
||||
use std::ops::Bound;
|
||||
|
||||
use crate::query::Occur;
|
||||
use crate::schema::{Term, Type};
|
||||
use crate::schema::Term;
|
||||
use crate::Score;
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -14,8 +14,6 @@ pub enum LogicalLiteral {
|
||||
prefix: bool,
|
||||
},
|
||||
Range {
|
||||
field: String,
|
||||
value_type: Type,
|
||||
lower: Bound<Term>,
|
||||
upper: Bound<Term>,
|
||||
},
|
||||
|
||||
@@ -790,8 +790,6 @@ impl QueryParser {
|
||||
let (field, json_path) = try_tuple!(self
|
||||
.split_full_path(&full_path)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
let mut errors = Vec::new();
|
||||
let lower = match self.resolve_bound(field, json_path, &lower) {
|
||||
Ok(bound) => bound,
|
||||
@@ -812,12 +810,8 @@ impl QueryParser {
|
||||
// we failed to parse something. Either way, there is no point emitting it
|
||||
return (None, errors);
|
||||
}
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
|
||||
field: self.schema.get_field_name(field).to_string(),
|
||||
value_type,
|
||||
lower,
|
||||
upper,
|
||||
}));
|
||||
let logical_ast =
|
||||
LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
|
||||
(Some(logical_ast), errors)
|
||||
}
|
||||
UserInputLeaf::Set {
|
||||
@@ -884,14 +878,7 @@ fn convert_literal_to_query(
|
||||
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
|
||||
}
|
||||
}
|
||||
LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
lower,
|
||||
upper,
|
||||
} => Box::new(RangeQuery::new_term_bounds(
|
||||
field, value_type, &lower, &upper,
|
||||
)),
|
||||
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
|
||||
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
|
||||
LogicalLiteral::All => Box::new(AllQuery),
|
||||
}
|
||||
@@ -1136,8 +1123,8 @@ mod test {
|
||||
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
|
||||
assert_eq!(
|
||||
format!("{query:?}"),
|
||||
"RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
|
||||
upper_bound: Included([98]), limit: None }"
|
||||
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
|
||||
Included(Term(field=0, type=Str, \"b\")), limit: None }"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Bound;
|
||||
|
||||
use crate::collector::Count;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::query::RangeQuery;
|
||||
use crate::{schema, IndexBuilder, TantivyDocument};
|
||||
use crate::{schema, IndexBuilder, TantivyDocument, Term};
|
||||
|
||||
#[test]
|
||||
fn range_query_fast_optional_field_minimum() {
|
||||
@@ -218,10 +220,9 @@ mod tests {
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query = RangeQuery::new_u64_bounds(
|
||||
"score".to_string(),
|
||||
std::ops::Bound::Included(70),
|
||||
std::ops::Bound::Unbounded,
|
||||
let query = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(score_field, 70)),
|
||||
Bound::Unbounded,
|
||||
);
|
||||
|
||||
let count = searcher.search(&query, &Count).unwrap();
|
||||
@@ -2,13 +2,11 @@ use std::ops::Bound;
|
||||
|
||||
use crate::schema::Type;
|
||||
|
||||
mod fast_field_range_query;
|
||||
mod fast_field_range_doc_set;
|
||||
mod range_query;
|
||||
mod range_query_ip_fastfield;
|
||||
mod range_query_u64_fastfield;
|
||||
|
||||
pub use self::range_query::RangeQuery;
|
||||
pub use self::range_query_ip_fastfield::IPFastFieldRangeWeight;
|
||||
pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||
|
||||
// TODO is this correct?
|
||||
|
||||
@@ -1,21 +1,17 @@
|
||||
use std::io;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::ops::{Bound, Range};
|
||||
use std::ops::Bound;
|
||||
|
||||
use columnar::MonotonicallyMappableToU128;
|
||||
use common::{BinarySerializable, BitSet};
|
||||
use common::BitSet;
|
||||
|
||||
use super::map_bound;
|
||||
use super::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||
use crate::error::TantivyError;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
|
||||
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, map_bound_res};
|
||||
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
|
||||
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption, Term, Type};
|
||||
use crate::termdict::{TermDictionary, TermStreamer};
|
||||
use crate::{DateTime, DocId, Score};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// `RangeQuery` matches all documents that have at least one term within a defined range.
|
||||
///
|
||||
@@ -40,8 +36,10 @@ use crate::{DateTime, DocId, Score};
|
||||
/// ```rust
|
||||
/// use tantivy::collector::Count;
|
||||
/// use tantivy::query::RangeQuery;
|
||||
/// use tantivy::Term;
|
||||
/// use tantivy::schema::{Schema, INDEXED};
|
||||
/// use tantivy::{doc, Index, IndexWriter};
|
||||
/// use std::ops::Bound;
|
||||
/// # fn test() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
|
||||
@@ -59,7 +57,10 @@ use crate::{DateTime, DocId, Score};
|
||||
///
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
/// let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
|
||||
/// let docs_in_the_sixties = RangeQuery::new(
|
||||
/// Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
/// Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
/// );
|
||||
/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
/// assert_eq!(num_60s_books, 2285);
|
||||
/// Ok(())
|
||||
@@ -68,246 +69,46 @@ use crate::{DateTime, DocId, Score};
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RangeQuery {
|
||||
field: String,
|
||||
value_type: Type,
|
||||
lower_bound: Bound<Vec<u8>>,
|
||||
upper_bound: Bound<Vec<u8>>,
|
||||
lower_bound: Bound<Term>,
|
||||
upper_bound: Bound<Term>,
|
||||
limit: Option<u64>,
|
||||
}
|
||||
|
||||
/// Returns the inner value of a `Bound`
|
||||
pub(crate) fn inner_bound(val: &Bound<Term>) -> Option<&Term> {
|
||||
match val {
|
||||
Bound::Included(term) | Bound::Excluded(term) => Some(term),
|
||||
Bound::Unbounded => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl RangeQuery {
|
||||
/// Creates a new `RangeQuery` from bounded start and end terms.
|
||||
///
|
||||
/// If the value type is not correct, something may go terribly wrong when
|
||||
/// the `Weight` object is created.
|
||||
pub fn new_term_bounds(
|
||||
field: String,
|
||||
value_type: Type,
|
||||
lower_bound: &Bound<Term>,
|
||||
upper_bound: &Bound<Term>,
|
||||
) -> RangeQuery {
|
||||
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
|
||||
pub fn new(lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> RangeQuery {
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type,
|
||||
lower_bound: map_bound(lower_bound, verify_and_unwrap_term),
|
||||
upper_bound: map_bound(upper_bound, verify_and_unwrap_term),
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new `RangeQuery` over a `i64` field.
|
||||
///
|
||||
/// If the field is not of the type `i64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_i64(field: String, range: Range<i64>) -> RangeQuery {
|
||||
RangeQuery::new_i64_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `i64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `i64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_i64_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<i64>,
|
||||
upper_bound: Bound<i64>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &i64| {
|
||||
Term::from_field_i64(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::I64,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new `RangeQuery` over a `f64` field.
|
||||
///
|
||||
/// If the field is not of the type `f64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_f64(field: String, range: Range<f64>) -> RangeQuery {
|
||||
RangeQuery::new_f64_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `f64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `f64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_f64_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<f64>,
|
||||
upper_bound: Bound<f64>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &f64| {
|
||||
Term::from_field_f64(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::F64,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `u64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `u64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_u64_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<u64>,
|
||||
upper_bound: Bound<u64>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &u64| {
|
||||
Term::from_field_u64(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::U64,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `ip` field.
|
||||
///
|
||||
/// If the field is not of the type `ip`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_ip_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<Ipv6Addr>,
|
||||
upper_bound: Bound<Ipv6Addr>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &Ipv6Addr| {
|
||||
Term::from_field_ip_addr(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::IpAddr,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `u64` field.
|
||||
///
|
||||
/// If the field is not of the type `u64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_u64(field: String, range: Range<u64>) -> RangeQuery {
|
||||
RangeQuery::new_u64_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `date` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `date`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_date_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<DateTime>,
|
||||
upper_bound: Bound<DateTime>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &DateTime| {
|
||||
Term::from_field_date(Field::from_field_id(0), *val)
|
||||
.serialized_value_bytes()
|
||||
.to_owned()
|
||||
};
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::Date,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `date` field.
|
||||
///
|
||||
/// If the field is not of the type `date`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_date(field: String, range: Range<DateTime>) -> RangeQuery {
|
||||
RangeQuery::new_date_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `Str` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `Str`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_str_bounds(
|
||||
field: String,
|
||||
lower_bound: Bound<&str>,
|
||||
upper_bound: Bound<&str>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &&str| val.as_bytes().to_vec();
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::Str,
|
||||
lower_bound: map_bound(&lower_bound, make_term_val),
|
||||
upper_bound: map_bound(&upper_bound, make_term_val),
|
||||
limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `Str` field.
|
||||
///
|
||||
/// If the field is not of the type `Str`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_str(field: String, range: Range<&str>) -> RangeQuery {
|
||||
RangeQuery::new_str_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Field to search over
|
||||
pub fn field(&self) -> &str {
|
||||
&self.field
|
||||
pub fn field(&self) -> Field {
|
||||
self.get_term().field()
|
||||
}
|
||||
|
||||
/// The value type of the field
|
||||
pub fn value_type(&self) -> Type {
|
||||
self.get_term().typ()
|
||||
}
|
||||
|
||||
pub(crate) fn get_term(&self) -> &Term {
|
||||
inner_bound(&self.lower_bound)
|
||||
.or(inner_bound(&self.upper_bound))
|
||||
.expect("At least one bound must be set")
|
||||
}
|
||||
|
||||
/// Limit the number of term the `RangeQuery` will go through.
|
||||
@@ -319,70 +120,23 @@ impl RangeQuery {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the type maps to a u64 fast field
|
||||
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
|
||||
match typ {
|
||||
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
||||
Type::IpAddr => false,
|
||||
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RangeQuery {
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
let schema = enable_scoring.schema();
|
||||
let field_type = schema
|
||||
.get_field_entry(schema.get_field(&self.field)?)
|
||||
.field_type();
|
||||
let value_type = field_type.value_type();
|
||||
if value_type != self.value_type {
|
||||
let err_msg = format!(
|
||||
"Create a range query of the type {:?}, when the field given was of type \
|
||||
{value_type:?}",
|
||||
self.value_type
|
||||
);
|
||||
return Err(TantivyError::SchemaError(err_msg));
|
||||
}
|
||||
let field_type = schema.get_field_entry(self.field()).field_type();
|
||||
|
||||
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
|
||||
if field_type.is_ip_addr() {
|
||||
let parse_ip_from_bytes = |data: &Vec<u8>| {
|
||||
let ip_u128_bytes: [u8; 16] = data.as_slice().try_into().map_err(|_| {
|
||||
crate::TantivyError::InvalidArgument(
|
||||
"Expected 8 bytes for ip address".to_string(),
|
||||
)
|
||||
})?;
|
||||
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
|
||||
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
|
||||
};
|
||||
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
|
||||
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
|
||||
Ok(Box::new(IPFastFieldRangeWeight::new(
|
||||
self.field.to_string(),
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
)))
|
||||
} else {
|
||||
// We run the range query on u64 value space for performance reasons and simplicity
|
||||
// assert the type maps to u64
|
||||
assert!(maps_to_u64_fastfield(self.value_type));
|
||||
let parse_from_bytes = |data: &Vec<u8>| {
|
||||
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
|
||||
};
|
||||
|
||||
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
|
||||
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
|
||||
Ok(Box::new(FastFieldRangeWeight::new_u64_lenient(
|
||||
self.field.to_string(),
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
)))
|
||||
}
|
||||
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) {
|
||||
Ok(Box::new(FastFieldRangeWeight::new(
|
||||
self.field(),
|
||||
self.lower_bound.clone(),
|
||||
self.upper_bound.clone(),
|
||||
)))
|
||||
} else {
|
||||
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
|
||||
Ok(Box::new(RangeWeight {
|
||||
field: self.field.to_string(),
|
||||
lower_bound: self.lower_bound.clone(),
|
||||
upper_bound: self.upper_bound.clone(),
|
||||
field: self.field(),
|
||||
lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term),
|
||||
upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term),
|
||||
limit: self.limit,
|
||||
}))
|
||||
}
|
||||
@@ -390,7 +144,7 @@ impl Query for RangeQuery {
|
||||
}
|
||||
|
||||
pub struct RangeWeight {
|
||||
field: String,
|
||||
field: Field,
|
||||
lower_bound: Bound<Vec<u8>>,
|
||||
upper_bound: Bound<Vec<u8>>,
|
||||
limit: Option<u64>,
|
||||
@@ -423,7 +177,7 @@ impl Weight for RangeWeight {
|
||||
let max_doc = reader.max_doc();
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||
|
||||
let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?;
|
||||
let inverted_index = reader.inverted_index(self.field)?;
|
||||
let term_dict = inverted_index.terms();
|
||||
let mut term_range = self.term_range(term_dict)?;
|
||||
let mut processed_count = 0;
|
||||
@@ -477,7 +231,7 @@ mod tests {
|
||||
use crate::schema::{
|
||||
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
|
||||
};
|
||||
use crate::{Index, IndexWriter};
|
||||
use crate::{Index, IndexWriter, Term};
|
||||
|
||||
#[test]
|
||||
fn test_range_query_simple() -> crate::Result<()> {
|
||||
@@ -499,7 +253,10 @@ mod tests {
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
|
||||
let docs_in_the_sixties = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
);
|
||||
|
||||
// ... or `1960..=1969` if inclusive range is enabled.
|
||||
let count = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
@@ -530,7 +287,10 @@ mod tests {
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
|
||||
let mut docs_in_the_sixties = RangeQuery::new(
|
||||
Bound::Included(Term::from_field_u64(year_field, 1960)),
|
||||
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
|
||||
);
|
||||
docs_in_the_sixties.limit(5);
|
||||
|
||||
// due to the limit and no docs in 1963, it's really only 1960..=1965
|
||||
@@ -575,29 +335,29 @@ mod tests {
|
||||
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64("intfield".to_string(), 10..11)),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||
Bound::Excluded(Term::from_field_i64(int_field, 11)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64_bounds(
|
||||
"intfield".to_string(),
|
||||
Bound::Included(10),
|
||||
Bound::Included(11)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||
Bound::Included(Term::from_field_i64(int_field, 11)),
|
||||
)),
|
||||
18
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64_bounds(
|
||||
"intfield".to_string(),
|
||||
Bound::Excluded(9),
|
||||
Bound::Included(10)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Excluded(Term::from_field_i64(int_field, 9)),
|
||||
Bound::Included(Term::from_field_i64(int_field, 10)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_i64_bounds(
|
||||
"intfield".to_string(),
|
||||
Bound::Included(9),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_i64(int_field, 9)),
|
||||
Bound::Unbounded
|
||||
)),
|
||||
91
|
||||
@@ -646,29 +406,29 @@ mod tests {
|
||||
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64("floatfield".to_string(), 10.0..11.0)),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||
Bound::Excluded(Term::from_field_f64(float_field, 11.0)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
"floatfield".to_string(),
|
||||
Bound::Included(10.0),
|
||||
Bound::Included(11.0)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||
Bound::Included(Term::from_field_f64(float_field, 11.0)),
|
||||
)),
|
||||
18
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
"floatfield".to_string(),
|
||||
Bound::Excluded(9.0),
|
||||
Bound::Included(10.0)
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Excluded(Term::from_field_f64(float_field, 9.0)),
|
||||
Bound::Included(Term::from_field_f64(float_field, 10.0)),
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
"floatfield".to_string(),
|
||||
Bound::Included(9.0),
|
||||
count_multiples(RangeQuery::new(
|
||||
Bound::Included(Term::from_field_f64(float_field, 9.0)),
|
||||
Bound::Unbounded
|
||||
)),
|
||||
91
|
||||
|
||||
@@ -1,512 +0,0 @@
|
||||
//! IP Fastfields support efficient scanning for range queries.
|
||||
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
|
||||
//! used, which uses the term dictionary + postings.
|
||||
|
||||
use std::net::Ipv6Addr;
|
||||
use std::ops::{Bound, RangeInclusive};
|
||||
|
||||
use columnar::{Column, MonotonicallyMappableToU128};
|
||||
|
||||
use crate::query::range_query::fast_field_range_query::RangeDocSet;
|
||||
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
|
||||
|
||||
/// Weight that executes a range query by scanning an ip address fast field.
pub struct IPFastFieldRangeWeight {
    // Name of the fast field column to scan.
    field: String,
    // Lower end of the accepted address range.
    lower_bound: Bound<Ipv6Addr>,
    // Upper end of the accepted address range.
    upper_bound: Bound<Ipv6Addr>,
}

impl IPFastFieldRangeWeight {
    /// Creates a weight matching the addresses of `field` that lie between `lower_bound` and
    /// `upper_bound`.
    pub fn new(field: String, lower_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>) -> Self {
        IPFastFieldRangeWeight {
            field,
            lower_bound,
            upper_bound,
        }
    }
}
|
||||
|
||||
impl Weight for IPFastFieldRangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
|
||||
reader.fast_fields().column_opt(&self.field)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
let value_range = bound_to_value_range(
|
||||
&self.lower_bound,
|
||||
&self.upper_bound,
|
||||
ip_addr_column.min_value(),
|
||||
ip_addr_column.max_value(),
|
||||
);
|
||||
let docset = RangeDocSet::new(value_range, ip_addr_column);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader, 1.0)?;
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Document #({doc}) does not match"
|
||||
)));
|
||||
}
|
||||
let explanation = Explanation::new("Const", scorer.score());
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a pair of `Bound`s into the inclusive address range to scan.
///
/// * `Included(ip)` is used as is.
/// * `Excluded(ip)` is moved one address inwards: `ip + 1` for the lower bound, `ip - 1` for
///   the upper bound.
/// * `Unbounded` falls back to `min_value` (lower bound) or `max_value` (upper bound).
///
/// An excluded lower bound of `ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff` or an excluded upper
/// bound of `::` cannot match any address. The adjustment by one would overflow the `u128`
/// representation there (panic in debug builds, wrap-around to a match-everything range in
/// release builds), so an empty range is returned instead.
fn bound_to_value_range(
    lower_bound: &Bound<Ipv6Addr>,
    upper_bound: &Bound<Ipv6Addr>,
    min_value: Ipv6Addr,
    max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
    // `start > end`, hence `RangeInclusive::is_empty()` holds. Inverted ranges are not new to
    // the caller: a lower bound above the upper bound already yields one.
    // NOTE(review): confirm the doc set consuming this range matches nothing when inverted.
    let empty_range = || Ipv6Addr::from(1u128)..=Ipv6Addr::from(0u128);

    let start_value = match lower_bound {
        Bound::Included(ip_addr) => *ip_addr,
        // The std conversion is the big-endian u128 value of the address octets.
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_add(1) {
            Some(next) => Ipv6Addr::from(next),
            // Nothing is greater than the highest address.
            None => return empty_range(),
        },
        Bound::Unbounded => min_value,
    };

    let end_value = match upper_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_sub(1) {
            Some(previous) => Ipv6Addr::from(previous),
            // Nothing is lower than the lowest address.
            None => return empty_range(),
        },
        Bound::Unbounded => max_value,
    };
    start_value..=end_value
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use proptest::prelude::ProptestConfig;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Doc {
|
||||
pub id: String,
|
||||
pub ip: Ipv6Addr,
|
||||
}
|
||||
|
||||
fn operation_strategy() -> impl Strategy<Value = Doc> {
|
||||
prop_oneof![
|
||||
(0u64..10_000u64).prop_map(doc_from_id_1),
|
||||
(1u64..10_000u64).prop_map(doc_from_id_2),
|
||||
]
|
||||
}
|
||||
|
||||
pub fn doc_from_id_1(id: u64) -> Doc {
|
||||
let id = id * 1000;
|
||||
Doc {
|
||||
// ip != id
|
||||
id: id.to_string(),
|
||||
ip: Ipv6Addr::from_u128(id as u128),
|
||||
}
|
||||
}
|
||||
fn doc_from_id_2(id: u64) -> Doc {
|
||||
let id = id * 1000;
|
||||
Doc {
|
||||
// ip != id
|
||||
id: (id - 1).to_string(),
|
||||
ip: Ipv6Addr::from_u128(id as u128),
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(10))]
|
||||
#[test]
|
||||
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
|
||||
assert!(test_ip_range_for_docs(&ops).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression1() {
|
||||
let ops = &[doc_from_id_1(0)];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression2() {
|
||||
let ops = &[
|
||||
doc_from_id_1(52),
|
||||
doc_from_id_1(63),
|
||||
doc_from_id_1(12),
|
||||
doc_from_id_2(91),
|
||||
doc_from_id_2(33),
|
||||
];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression3() {
|
||||
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression3_simple() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
|
||||
.into_iter()
|
||||
.map(Ipv6Addr::from_u128)
|
||||
.collect();
|
||||
for &ip_addr in &ip_addrs {
|
||||
writer
|
||||
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
|
||||
.unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let range_weight = IPFastFieldRangeWeight {
|
||||
field: "ips".to_string(),
|
||||
lower_bound: Bound::Included(ip_addrs[1]),
|
||||
upper_bound: Bound::Included(ip_addrs[2]),
|
||||
};
|
||||
let count = range_weight.count(searcher.segment_reader(0)).unwrap();
|
||||
assert_eq!(count, 2);
|
||||
}
|
||||
|
||||
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
|
||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
||||
let text_field = schema_builder.add_text_field("id", STRING | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
|
||||
for doc in docs.iter() {
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
ips_field => doc.ip,
|
||||
ips_field => doc.ip,
|
||||
ip_field => doc.ip,
|
||||
text_field => doc.id.to_string(),
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index
|
||||
}
|
||||
|
||||
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
|
||||
let index = create_index_from_docs(docs);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(&index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
|
||||
format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
|
||||
};
|
||||
|
||||
let test_sample = |sample_docs: &[Doc]| {
|
||||
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
|
||||
ips.sort();
|
||||
let ip_range = ips[0]..=ips[1];
|
||||
let expected_num_hits = docs
|
||||
.iter()
|
||||
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
|
||||
.count();
|
||||
|
||||
let query = gen_query_inclusive("ip", &ip_range);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
let query = gen_query_inclusive("ips", &ip_range);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
// Intersection search
|
||||
let id_filter = sample_docs[0].id.to_string();
|
||||
let expected_num_hits = docs
|
||||
.iter()
|
||||
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
|
||||
.count();
|
||||
let query = format!(
|
||||
"{} AND id:{}",
|
||||
gen_query_inclusive("ip", &ip_range),
|
||||
&id_filter
|
||||
);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
// Intersection search on multivalue ip field
|
||||
let id_filter = sample_docs[0].id.to_string();
|
||||
let query = format!(
|
||||
"{} AND id:{}",
|
||||
gen_query_inclusive("ips", &ip_range),
|
||||
&id_filter
|
||||
);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
};
|
||||
|
||||
test_sample(&[docs[0].clone(), docs[0].clone()]);
|
||||
if docs.len() > 1 {
|
||||
test_sample(&[docs[0].clone(), docs[1].clone()]);
|
||||
test_sample(&[docs[1].clone(), docs[1].clone()]);
|
||||
}
|
||||
if docs.len() > 2 {
|
||||
test_sample(&[docs[1].clone(), docs[2].clone()]);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use test::Bencher;
|
||||
|
||||
use super::tests::*;
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::Index;
|
||||
|
||||
fn get_index_0_to_100() -> Index {
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let num_vals = 100_000;
|
||||
let docs: Vec<_> = (0..num_vals)
|
||||
.map(|_i| {
|
||||
let id = if rng.gen_bool(0.01) {
|
||||
"veryfew".to_string() // 1%
|
||||
} else if rng.gen_bool(0.1) {
|
||||
"few".to_string() // 9%
|
||||
} else {
|
||||
"many".to_string() // 90%
|
||||
};
|
||||
Doc {
|
||||
id,
|
||||
// Multiply by 1000, so that we create many buckets in the compact space
|
||||
// The benches depend on this range to select n-percent of elements with the
|
||||
// methods below.
|
||||
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
create_index_from_docs(&docs)
|
||||
}
|
||||
|
||||
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(90 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(10 * 1000);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn excute_query(
|
||||
field: &str,
|
||||
ip_range: RangeInclusive<Ipv6Addr>,
|
||||
suffix: &str,
|
||||
index: &Index,
|
||||
) -> usize {
|
||||
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
|
||||
format!(
|
||||
"{}:[{} TO {}] {}",
|
||||
field,
|
||||
&from.to_string(),
|
||||
&to.to_string(),
|
||||
suffix
|
||||
)
|
||||
};
|
||||
|
||||
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
let query = query_from_text(&query);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&query, &(Count)).unwrap()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
}
|
||||
@@ -2,54 +2,34 @@
|
||||
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
|
||||
//! used, which uses the term dictionary + postings.
|
||||
|
||||
use std::net::Ipv6Addr;
|
||||
use std::ops::{Bound, RangeInclusive};
|
||||
|
||||
use columnar::{ColumnType, HasAssociatedColumnType, MonotonicallyMappableToU64};
|
||||
use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use common::BinarySerializable;
|
||||
|
||||
use super::fast_field_range_query::RangeDocSet;
|
||||
use super::map_bound;
|
||||
use crate::query::{ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
|
||||
use super::fast_field_range_doc_set::RangeDocSet;
|
||||
use super::{map_bound, map_bound_res};
|
||||
use crate::query::range_query::range_query::inner_bound;
|
||||
use crate::query::{AllScorer, ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight};
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
|
||||
|
||||
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FastFieldRangeWeight {
|
||||
field: String,
|
||||
lower_bound: Bound<u64>,
|
||||
upper_bound: Bound<u64>,
|
||||
column_type_opt: Option<ColumnType>,
|
||||
lower_bound: Bound<Term>,
|
||||
upper_bound: Bound<Term>,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl FastFieldRangeWeight {
|
||||
/// Create a new FastFieldRangeWeight, using the u64 representation of any fast field.
|
||||
pub(crate) fn new_u64_lenient(
|
||||
field: String,
|
||||
lower_bound: Bound<u64>,
|
||||
upper_bound: Bound<u64>,
|
||||
) -> Self {
|
||||
let lower_bound = map_bound(&lower_bound, |val| *val);
|
||||
let upper_bound = map_bound(&upper_bound, |val| *val);
|
||||
/// Create a new FastFieldRangeWeight
|
||||
pub(crate) fn new(field: Field, lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> Self {
|
||||
Self {
|
||||
field,
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
column_type_opt: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `FastFieldRangeWeight` for a range of a u64-mappable type .
|
||||
pub fn new<T: HasAssociatedColumnType + MonotonicallyMappableToU64>(
|
||||
field: String,
|
||||
lower_bound: Bound<T>,
|
||||
upper_bound: Bound<T>,
|
||||
) -> Self {
|
||||
let lower_bound = map_bound(&lower_bound, |val| val.to_u64());
|
||||
let upper_bound = map_bound(&upper_bound, |val| val.to_u64());
|
||||
Self {
|
||||
field,
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
column_type_opt: Some(T::column_type()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -65,30 +45,86 @@ impl Query for FastFieldRangeWeight {
|
||||
|
||||
impl Weight for FastFieldRangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let fast_field_reader = reader.fast_fields();
|
||||
let column_type_opt: Option<[ColumnType; 1]> =
|
||||
self.column_type_opt.map(|column_type| [column_type]);
|
||||
let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt
|
||||
.as_ref()
|
||||
.map(|column_types| column_types.as_slice());
|
||||
let Some((column, _)) =
|
||||
fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
let value_range = bound_to_value_range(
|
||||
&self.lower_bound,
|
||||
&self.upper_bound,
|
||||
column.min_value(),
|
||||
column.max_value(),
|
||||
)
|
||||
.unwrap_or(1..=0); // empty range
|
||||
if value_range.is_empty() {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
// Check if both bounds are Bound::Unbounded
|
||||
if self.lower_bound == Bound::Unbounded && self.upper_bound == Bound::Unbounded {
|
||||
return Ok(Box::new(AllScorer::new(reader.max_doc())));
|
||||
}
|
||||
let field_name = reader.schema().get_field_name(self.field);
|
||||
let field_type = reader.schema().get_field_entry(self.field).field_type();
|
||||
if field_type.is_ip_addr() {
|
||||
// Decodes the serialized value of a term into an `Ipv6Addr`. The value must be exactly
// 16 bytes, which are read as a big-endian `u128`; any other length is reported as an
// `InvalidArgument` error.
let parse_ip_from_bytes = |term: &Term| {
    let ip_u128_bytes: [u8; 16] = term.serialized_value_bytes().try_into().map_err(|_| {
        crate::TantivyError::InvalidArgument("Expected 16 bytes for ip address".to_string())
    })?;
    let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
    crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
};
|
||||
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
|
||||
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
|
||||
|
||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
|
||||
reader.fast_fields().column_opt(field_name)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
let value_range = bound_to_value_range_ip(
|
||||
&lower_bound,
|
||||
&upper_bound,
|
||||
ip_addr_column.min_value(),
|
||||
ip_addr_column.max_value(),
|
||||
);
|
||||
let docset = RangeDocSet::new(value_range, ip_addr_column);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
} else {
|
||||
assert!(
|
||||
maps_to_u64_fastfield(field_type.value_type()),
|
||||
"{:?}",
|
||||
field_type
|
||||
);
|
||||
|
||||
let term = inner_bound(&self.lower_bound)
|
||||
.or(inner_bound(&self.upper_bound))
|
||||
.expect("At least one bound must be set");
|
||||
assert_eq!(
|
||||
term.typ(),
|
||||
field_type.value_type(),
|
||||
"Field is of type {:?}, but got term of type {:?}",
|
||||
field_type,
|
||||
term.typ()
|
||||
);
|
||||
|
||||
let parse_from_bytes = |term: &Term| {
|
||||
u64::from_be(
|
||||
BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..])
|
||||
.unwrap(),
|
||||
)
|
||||
};
|
||||
|
||||
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
|
||||
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
|
||||
|
||||
let fast_field_reader = reader.fast_fields();
|
||||
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(None, field_name)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
let value_range = bound_to_value_range(
|
||||
&lower_bound,
|
||||
&upper_bound,
|
||||
column.min_value(),
|
||||
column.max_value(),
|
||||
)
|
||||
.unwrap_or(1..=0); // empty range
|
||||
if value_range.is_empty() {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
}
|
||||
let docset = RangeDocSet::new(value_range, column);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
let docset = RangeDocSet::new(value_range, column);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
@@ -104,6 +140,35 @@ impl Weight for FastFieldRangeWeight {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the type maps to a u64 fast field
|
||||
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
|
||||
match typ {
|
||||
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
|
||||
Type::IpAddr => false,
|
||||
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a pair of `Bound`s on IP addresses into an inclusive range.
///
/// * `Included` bounds are used as they are.
/// * `Excluded` bounds are moved one address inwards.
/// * `Unbounded` falls back to `min_value` (lower side) or `max_value` (upper side).
///
/// An excluded lower bound on the highest address, or an excluded upper bound on the lowest
/// address, cannot match anything. Instead of overflowing the `u128` arithmetic (a panic in
/// debug builds, a silently wrapped and therefore wrong range in release builds), an empty
/// range whose start is greater than its end is returned.
fn bound_to_value_range_ip(
    lower_bound: &Bound<Ipv6Addr>,
    upper_bound: &Bound<Ipv6Addr>,
    min_value: Ipv6Addr,
    max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
    // start > end, so `RangeInclusive::is_empty` is true and `contains` never matches.
    let empty_range = || Ipv6Addr::from(1u128)..=Ipv6Addr::from(0u128);

    let start_value = match lower_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_add(1) {
            Some(next) => Ipv6Addr::from(next),
            // Nothing lies above the highest address.
            None => return empty_range(),
        },
        Bound::Unbounded => min_value,
    };

    let end_value = match upper_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_sub(1) {
            Some(prev) => Ipv6Addr::from(prev),
            // Nothing lies below the lowest address.
            None => return empty_range(),
        },
        Bound::Unbounded => max_value,
    };

    start_value..=end_value
}
|
||||
|
||||
// Returns None if the range cannot be converted to an inclusive range (which is equivalent to an empty
|
||||
// range).
|
||||
fn bound_to_value_range<T: MonotonicallyMappableToU64>(
|
||||
@@ -141,7 +206,7 @@ pub mod tests {
|
||||
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
|
||||
use crate::query::{QueryParser, Weight};
|
||||
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
|
||||
use crate::{Index, IndexWriter, TERMINATED};
|
||||
use crate::{Index, IndexWriter, Term, TERMINATED};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Doc {
|
||||
@@ -213,10 +278,10 @@ pub mod tests {
|
||||
writer.add_document(doc!(field=>52_000u64)).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let range_query = FastFieldRangeWeight::new_u64_lenient(
|
||||
"test_field".to_string(),
|
||||
Bound::Included(50_000),
|
||||
Bound::Included(50_002),
|
||||
let range_query = FastFieldRangeWeight::new(
|
||||
field,
|
||||
Bound::Included(Term::from_field_u64(field, 50_000)),
|
||||
Bound::Included(Term::from_field_u64(field, 50_002)),
|
||||
);
|
||||
let scorer = range_query
|
||||
.scorer(searcher.segment_reader(0), 1.0f32)
|
||||
@@ -394,6 +459,202 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod ip_range_tests {
|
||||
use proptest::prelude::ProptestConfig;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
|
||||
use crate::{Index, IndexWriter};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Doc {
|
||||
pub id: String,
|
||||
pub ip: Ipv6Addr,
|
||||
}
|
||||
|
||||
fn operation_strategy() -> impl Strategy<Value = Doc> {
|
||||
prop_oneof![
|
||||
(0u64..10_000u64).prop_map(doc_from_id_1),
|
||||
(1u64..10_000u64).prop_map(doc_from_id_2),
|
||||
]
|
||||
}
|
||||
|
||||
pub fn doc_from_id_1(id: u64) -> Doc {
|
||||
let id = id * 1000;
|
||||
Doc {
|
||||
// ip != id
|
||||
id: id.to_string(),
|
||||
ip: Ipv6Addr::from_u128(id as u128),
|
||||
}
|
||||
}
|
||||
fn doc_from_id_2(id: u64) -> Doc {
|
||||
let id = id * 1000;
|
||||
Doc {
|
||||
// ip != id
|
||||
id: (id - 1).to_string(),
|
||||
ip: Ipv6Addr::from_u128(id as u128),
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(10))]
|
||||
#[test]
|
||||
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
|
||||
assert!(test_ip_range_for_docs(&ops).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression1() {
|
||||
let ops = &[doc_from_id_1(0)];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression2() {
|
||||
let ops = &[
|
||||
doc_from_id_1(52),
|
||||
doc_from_id_1(63),
|
||||
doc_from_id_1(12),
|
||||
doc_from_id_2(91),
|
||||
doc_from_id_2(33),
|
||||
];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression3() {
|
||||
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
|
||||
assert!(test_ip_range_for_docs(ops).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_range_regression3_simple() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
|
||||
.into_iter()
|
||||
.map(Ipv6Addr::from_u128)
|
||||
.collect();
|
||||
for &ip_addr in &ip_addrs {
|
||||
writer
|
||||
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
|
||||
.unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let range_weight = FastFieldRangeWeight::new(
|
||||
ips_field,
|
||||
Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1])),
|
||||
Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2])),
|
||||
);
|
||||
|
||||
let count =
|
||||
crate::query::weight::Weight::count(&range_weight, searcher.segment_reader(0)).unwrap();
|
||||
assert_eq!(count, 2);
|
||||
}
|
||||
|
||||
pub fn create_index_from_ip_docs(docs: &[Doc]) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
|
||||
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
|
||||
let text_field = schema_builder.add_text_field("id", STRING | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
|
||||
for doc in docs.iter() {
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
ips_field => doc.ip,
|
||||
ips_field => doc.ip,
|
||||
ip_field => doc.ip,
|
||||
text_field => doc.id.to_string(),
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index
|
||||
}
|
||||
|
||||
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
|
||||
let index = create_index_from_ip_docs(docs);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(&index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
|
||||
format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
|
||||
};
|
||||
|
||||
let test_sample = |sample_docs: &[Doc]| {
|
||||
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
|
||||
ips.sort();
|
||||
let ip_range = ips[0]..=ips[1];
|
||||
let expected_num_hits = docs
|
||||
.iter()
|
||||
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
|
||||
.count();
|
||||
|
||||
let query = gen_query_inclusive("ip", &ip_range);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
let query = gen_query_inclusive("ips", &ip_range);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
// Intersection search
|
||||
let id_filter = sample_docs[0].id.to_string();
|
||||
let expected_num_hits = docs
|
||||
.iter()
|
||||
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
|
||||
.count();
|
||||
let query = format!(
|
||||
"{} AND id:{}",
|
||||
gen_query_inclusive("ip", &ip_range),
|
||||
&id_filter
|
||||
);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
|
||||
// Intersection search on multivalue ip field
|
||||
let id_filter = sample_docs[0].id.to_string();
|
||||
let query = format!(
|
||||
"{} AND id:{}",
|
||||
gen_query_inclusive("ips", &ip_range),
|
||||
&id_filter
|
||||
);
|
||||
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
|
||||
};
|
||||
|
||||
test_sample(&[docs[0].clone(), docs[0].clone()]);
|
||||
if docs.len() > 1 {
|
||||
test_sample(&[docs[0].clone(), docs[1].clone()]);
|
||||
test_sample(&[docs[1].clone(), docs[1].clone()]);
|
||||
}
|
||||
if docs.len() > 2 {
|
||||
test_sample(&[docs[1].clone(), docs[2].clone()]);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
@@ -601,3 +862,242 @@ mod bench {
|
||||
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench_ip {
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use test::Bencher;
|
||||
|
||||
use super::ip_range_tests::*;
|
||||
use super::*;
|
||||
use crate::collector::Count;
|
||||
use crate::query::QueryParser;
|
||||
use crate::Index;
|
||||
|
||||
fn get_index_0_to_100() -> Index {
|
||||
let mut rng = StdRng::from_seed([1u8; 32]);
|
||||
let num_vals = 100_000;
|
||||
let docs: Vec<_> = (0..num_vals)
|
||||
.map(|_i| {
|
||||
let id = if rng.gen_bool(0.01) {
|
||||
"veryfew".to_string() // 1%
|
||||
} else if rng.gen_bool(0.1) {
|
||||
"few".to_string() // 9%
|
||||
} else {
|
||||
"many".to_string() // 90%
|
||||
};
|
||||
Doc {
|
||||
id,
|
||||
// Multiply by 1000, so that we create many buckets in the compact space
|
||||
// The benches depend on this range to select n-percent of elements with the
|
||||
// methods below.
|
||||
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
create_index_from_ip_docs(&docs)
|
||||
}
|
||||
|
||||
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(90 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(0);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
|
||||
let start = Ipv6Addr::from_u128(10 * 1000);
|
||||
let end = Ipv6Addr::from_u128(10 * 1000);
|
||||
start..=end
|
||||
}
|
||||
|
||||
fn excute_query(
|
||||
field: &str,
|
||||
ip_range: RangeInclusive<Ipv6Addr>,
|
||||
suffix: &str,
|
||||
index: &Index,
|
||||
) -> usize {
|
||||
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
|
||||
format!(
|
||||
"{}:[{} TO {}] {}",
|
||||
field,
|
||||
&from.to_string(),
|
||||
&to.to_string(),
|
||||
suffix
|
||||
)
|
||||
};
|
||||
|
||||
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
|
||||
let query_from_text = |text: &str| {
|
||||
QueryParser::for_index(index, vec![])
|
||||
.parse_query(text)
|
||||
.unwrap()
|
||||
};
|
||||
let query = query_from_text(&query);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&query, &(Count)).unwrap()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ip", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_1_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_10_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:many", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:few", &index));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
|
||||
let index = get_index_0_to_100();
|
||||
|
||||
bench.iter(|| excute_query("ips", get_90_percent(), "AND id:veryfew", &index));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user