refactor fast field query (#2452)

As preparation of #2023 and #1709

* Use Term to pass parameters
* merge u64 and ip fast field range query

Side note: I did not rename range_query_u64_fastfield, because then git can't track the changes.
This commit is contained in:
PSeitz
2024-07-15 19:08:05 +09:00
committed by GitHub
parent eab660873a
commit 1b4076691f
14 changed files with 682 additions and 951 deletions

View File

@@ -1,3 +1,5 @@
use std::ops::Bound;
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
@@ -5,7 +7,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, IndexWriter, Result, Term};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
reader.reload()?;
let searcher = reader.searcher();
// The end is excluded i.e. here we are searching up to 1969
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// Uses a Count collector to sum the total number of docs in the range
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(num_60s_books, 10);

View File

@@ -303,7 +303,7 @@ mod tests_mmap {
Type::Str,
),
(format!("{field_name_out_internal}a"), Type::Str),
(format!("{field_name_out_internal}"), Type::Str),
(field_name_out_internal.to_string(), Type::Str),
(format!("num{field_name_out_internal}"), Type::I64),
];
expected_fields.sort();

View File

@@ -22,10 +22,7 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer {
doc: 0u32,
max_doc: reader.max_doc(),
};
let all_scorer = AllScorer::new(reader.max_doc());
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
}
@@ -43,6 +40,13 @@ pub struct AllScorer {
max_doc: DocId,
}
impl AllScorer {
/// Creates a new AllScorer with `max_doc` docs.
pub fn new(max_doc: DocId) -> AllScorer {
AllScorer { doc: 0u32, max_doc }
}
}
impl DocSet for AllScorer {
#[inline(always)]
fn advance(&mut self) -> DocId {

View File

@@ -192,7 +192,7 @@ mod tests {
.cloned()
.map(VecDocSet::from)
.map(|d| ConstScorer::new(d, 1.0)),
DoNothingCombiner::default(),
DoNothingCombiner,
min_match,
)
};

View File

@@ -149,7 +149,7 @@ mod tests {
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{Index, Searcher};
use crate::{Index, Searcher, Term};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
@@ -188,9 +188,8 @@ mod tests {
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 50)),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
@@ -198,10 +197,9 @@ mod tests {
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(0),
Bound::Excluded(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 0)),
Bound::Included(Term::from_field_u64(all_field, 50)),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
]);

View File

@@ -54,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{

View File

@@ -145,15 +145,7 @@ impl Query for PhrasePrefixQuery {
Bound::Unbounded
};
let mut range_query = RangeQuery::new_term_bounds(
enable_scoring
.schema()
.get_field_name(self.field)
.to_owned(),
self.prefix.1.typ(),
&Bound::Included(self.prefix.1.clone()),
&end_term,
);
let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
range_query.limit(self.max_expansions as u64);
range_query.weight(enable_scoring)
}

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound;
use crate::query::Occur;
use crate::schema::{Term, Type};
use crate::schema::Term;
use crate::Score;
#[derive(Clone)]
@@ -14,8 +14,6 @@ pub enum LogicalLiteral {
prefix: bool,
},
Range {
field: String,
value_type: Type,
lower: Bound<Term>,
upper: Bound<Term>,
},

View File

@@ -790,8 +790,6 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let mut errors = Vec::new();
let lower = match self.resolve_bound(field, json_path, &lower) {
Ok(bound) => bound,
@@ -812,12 +810,8 @@ impl QueryParser {
// we failed to parse something. Either way, there is no point emitting it
return (None, errors);
}
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
field: self.schema.get_field_name(field).to_string(),
value_type,
lower,
upper,
}));
let logical_ast =
LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
(Some(logical_ast), errors)
}
UserInputLeaf::Set {
@@ -884,14 +878,7 @@ fn convert_literal_to_query(
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
}
}
LogicalLiteral::Range {
field,
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}
@@ -1136,8 +1123,8 @@ mod test {
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
assert_eq!(
format!("{query:?}"),
"RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
upper_bound: Included([98]), limit: None }"
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
Included(Term(field=0, type=Str, \"b\")), limit: None }"
);
}

View File

@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
#[cfg(test)]
mod tests {
use std::ops::Bound;
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, IndexBuilder, TantivyDocument};
use crate::{schema, IndexBuilder, TantivyDocument, Term};
#[test]
fn range_query_fast_optional_field_minimum() {
@@ -218,10 +220,9 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
let query = RangeQuery::new(
Bound::Included(Term::from_field_u64(score_field, 70)),
Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();

View File

@@ -2,13 +2,11 @@ use std::ops::Bound;
use crate::schema::Type;
mod fast_field_range_query;
mod fast_field_range_doc_set;
mod range_query;
mod range_query_ip_fastfield;
mod range_query_u64_fastfield;
pub use self::range_query::RangeQuery;
pub use self::range_query_ip_fastfield::IPFastFieldRangeWeight;
pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
// TODO is this correct?

View File

@@ -1,21 +1,17 @@
use std::io;
use std::net::Ipv6Addr;
use std::ops::{Bound, Range};
use std::ops::Bound;
use columnar::MonotonicallyMappableToU128;
use common::{BinarySerializable, BitSet};
use common::BitSet;
use super::map_bound;
use super::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::error::TantivyError;
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match;
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, map_bound_res};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DateTime, DocId, Score};
use crate::{DocId, Score};
/// `RangeQuery` matches all documents that have at least one term within a defined range.
///
@@ -40,8 +36,10 @@ use crate::{DateTime, DocId, Score};
/// ```rust
/// use tantivy::collector::Count;
/// use tantivy::query::RangeQuery;
/// use tantivy::Term;
/// use tantivy::schema::{Schema, INDEXED};
/// use tantivy::{doc, Index, IndexWriter};
/// use std::ops::Bound;
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
@@ -59,7 +57,10 @@ use crate::{DateTime, DocId, Score};
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
/// let docs_in_the_sixties = RangeQuery::new(
/// Bound::Included(Term::from_field_u64(year_field, 1960)),
/// Bound::Excluded(Term::from_field_u64(year_field, 1970)),
/// );
/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
/// assert_eq!(num_60s_books, 2285);
/// Ok(())
@@ -68,246 +69,46 @@ use crate::{DateTime, DocId, Score};
/// ```
#[derive(Clone, Debug)]
pub struct RangeQuery {
field: String,
value_type: Type,
lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>,
lower_bound: Bound<Term>,
upper_bound: Bound<Term>,
limit: Option<u64>,
}
/// Borrows the term carried by a `Bound`.
///
/// Yields `Some(term)` for both `Included` and `Excluded` bounds and `None`
/// for `Unbounded`; whether the bound is inclusive is discarded.
pub(crate) fn inner_bound(val: &Bound<Term>) -> Option<&Term> {
    if let Bound::Included(term) | Bound::Excluded(term) = val {
        Some(term)
    } else {
        None
    }
}
impl RangeQuery {
/// Creates a new `RangeQuery` from bounded start and end terms.
///
/// If the value type is not correct, something may go terribly wrong when
/// the `Weight` object is created.
pub fn new_term_bounds(
field: String,
value_type: Type,
lower_bound: &Bound<Term>,
upper_bound: &Bound<Term>,
) -> RangeQuery {
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
pub fn new(lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> RangeQuery {
RangeQuery {
field,
value_type,
lower_bound: map_bound(lower_bound, verify_and_unwrap_term),
upper_bound: map_bound(upper_bound, verify_and_unwrap_term),
lower_bound,
upper_bound,
limit: None,
}
}
/// Creates a new `RangeQuery` over a `i64` field.
///
/// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_i64(field: String, range: Range<i64>) -> RangeQuery {
RangeQuery::new_i64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `i64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_i64_bounds(
field: String,
lower_bound: Bound<i64>,
upper_bound: Bound<i64>,
) -> RangeQuery {
let make_term_val = |val: &i64| {
Term::from_field_i64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::I64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Creates a new `RangeQuery` over a `f64` field.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64(field: String, range: Range<f64>) -> RangeQuery {
RangeQuery::new_f64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `f64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64_bounds(
field: String,
lower_bound: Bound<f64>,
upper_bound: Bound<f64>,
) -> RangeQuery {
let make_term_val = |val: &f64| {
Term::from_field_f64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::F64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `u64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_u64_bounds(
field: String,
lower_bound: Bound<u64>,
upper_bound: Bound<u64>,
) -> RangeQuery {
let make_term_val = |val: &u64| {
Term::from_field_u64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::U64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `ip` field.
///
/// If the field is not of the type `ip`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_ip_bounds(
field: String,
lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>,
) -> RangeQuery {
let make_term_val = |val: &Ipv6Addr| {
Term::from_field_ip_addr(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::IpAddr,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `u64` field.
///
/// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_u64(field: String, range: Range<u64>) -> RangeQuery {
RangeQuery::new_u64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `date` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date_bounds(
field: String,
lower_bound: Bound<DateTime>,
upper_bound: Bound<DateTime>,
) -> RangeQuery {
let make_term_val = |val: &DateTime| {
Term::from_field_date(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::Date,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `date` field.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date(field: String, range: Range<DateTime>) -> RangeQuery {
RangeQuery::new_date_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `Str` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_str_bounds(
field: String,
lower_bound: Bound<&str>,
upper_bound: Bound<&str>,
) -> RangeQuery {
let make_term_val = |val: &&str| val.as_bytes().to_vec();
RangeQuery {
field,
value_type: Type::Str,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `Str` field.
///
/// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_str(field: String, range: Range<&str>) -> RangeQuery {
RangeQuery::new_str_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Field to search over
pub fn field(&self) -> &str {
&self.field
pub fn field(&self) -> Field {
self.get_term().field()
}
/// The value type of the field
pub fn value_type(&self) -> Type {
self.get_term().typ()
}
pub(crate) fn get_term(&self) -> &Term {
inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set")
}
/// Limit the number of terms the `RangeQuery` will go through.
@@ -319,70 +120,23 @@ impl RangeQuery {
}
}
/// Tells whether values of the given type are handled by the u64 fast field path.
///
/// `true` for `U64`, `I64`, `F64`, `Bool` and `Date`; `false` for `IpAddr`,
/// `Str`, `Facet`, `Bytes` and `Json`. The match is kept exhaustive on purpose
/// so that adding a `Type` variant forces a decision here.
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
    match typ {
        Type::Str | Type::Facet | Type::Bytes | Type::Json | Type::IpAddr => false,
        Type::Date | Type::Bool | Type::F64 | Type::I64 | Type::U64 => true,
    }
}
impl Query for RangeQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let field_type = schema
.get_field_entry(schema.get_field(&self.field)?)
.field_type();
let value_type = field_type.value_type();
if value_type != self.value_type {
let err_msg = format!(
"Create a range query of the type {:?}, when the field given was of type \
{value_type:?}",
self.value_type
);
return Err(TantivyError::SchemaError(err_msg));
}
let field_type = schema.get_field_entry(self.field()).field_type();
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
if field_type.is_ip_addr() {
let parse_ip_from_bytes = |data: &Vec<u8>| {
let ip_u128_bytes: [u8; 16] = data.as_slice().try_into().map_err(|_| {
crate::TantivyError::InvalidArgument(
"Expected 8 bytes for ip address".to_string(),
)
})?;
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
};
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
Ok(Box::new(IPFastFieldRangeWeight::new(
self.field.to_string(),
lower_bound,
upper_bound,
)))
} else {
// We run the range query on u64 value space for performance reasons and simplicity
// assert the type maps to u64
assert!(maps_to_u64_fastfield(self.value_type));
let parse_from_bytes = |data: &Vec<u8>| {
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
};
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
Ok(Box::new(FastFieldRangeWeight::new_u64_lenient(
self.field.to_string(),
lower_bound,
upper_bound,
)))
}
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) {
Ok(Box::new(FastFieldRangeWeight::new(
self.field(),
self.lower_bound.clone(),
self.upper_bound.clone(),
)))
} else {
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
Ok(Box::new(RangeWeight {
field: self.field.to_string(),
lower_bound: self.lower_bound.clone(),
upper_bound: self.upper_bound.clone(),
field: self.field(),
lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term),
upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term),
limit: self.limit,
}))
}
@@ -390,7 +144,7 @@ impl Query for RangeQuery {
}
pub struct RangeWeight {
field: String,
field: Field,
lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>,
limit: Option<u64>,
@@ -423,7 +177,7 @@ impl Weight for RangeWeight {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?;
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_range = self.term_range(term_dict)?;
let mut processed_count = 0;
@@ -477,7 +231,7 @@ mod tests {
use crate::schema::{
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
};
use crate::{Index, IndexWriter};
use crate::{Index, IndexWriter, Term};
#[test]
fn test_range_query_simple() -> crate::Result<()> {
@@ -499,7 +253,10 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// ... or `1960..=1969` if inclusive range is enabled.
let count = searcher.search(&docs_in_the_sixties, &Count)?;
@@ -530,7 +287,10 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
let mut docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
docs_in_the_sixties.limit(5);
// due to the limit and no docs in 1963, it's really only 1960..=1965
@@ -575,29 +335,29 @@ mod tests {
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_i64("intfield".to_string(), 10..11)),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 10)),
Bound::Excluded(Term::from_field_i64(int_field, 11)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Included(10),
Bound::Included(11)
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 10)),
Bound::Included(Term::from_field_i64(int_field, 11)),
)),
18
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Excluded(9),
Bound::Included(10)
count_multiples(RangeQuery::new(
Bound::Excluded(Term::from_field_i64(int_field, 9)),
Bound::Included(Term::from_field_i64(int_field, 10)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Included(9),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 9)),
Bound::Unbounded
)),
91
@@ -646,29 +406,29 @@ mod tests {
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_f64("floatfield".to_string(), 10.0..11.0)),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 10.0)),
Bound::Excluded(Term::from_field_f64(float_field, 11.0)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Included(10.0),
Bound::Included(11.0)
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 10.0)),
Bound::Included(Term::from_field_f64(float_field, 11.0)),
)),
18
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Excluded(9.0),
Bound::Included(10.0)
count_multiples(RangeQuery::new(
Bound::Excluded(Term::from_field_f64(float_field, 9.0)),
Bound::Included(Term::from_field_f64(float_field, 10.0)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Included(9.0),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 9.0)),
Bound::Unbounded
)),
91

View File

@@ -1,512 +0,0 @@
//! IP Fastfields support efficient scanning for range queries.
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::{Column, MonotonicallyMappableToU128};
use crate::query::range_query::fast_field_range_query::RangeDocSet;
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
///
/// The weight stores the name of the fast field together with the lower and
/// upper bound of the ip range; the bounds are resolved against the column's
/// min/max values when a scorer is created.
pub struct IPFastFieldRangeWeight {
// Name of the fast field column that is looked up on each segment reader.
field: String,
// Lower end of the ip range (inclusive, exclusive or unbounded).
lower_bound: Bound<Ipv6Addr>,
// Upper end of the ip range (inclusive, exclusive or unbounded).
upper_bound: Bound<Ipv6Addr>,
}
impl IPFastFieldRangeWeight {
/// Creates a new IPFastFieldRangeWeight.
pub fn new(field: String, lower_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>) -> Self {
Self {
field,
lower_bound,
upper_bound,
}
}
}
impl Weight for IPFastFieldRangeWeight {
    /// Creates a constant-score scorer over the docs whose ip value lies in the range.
    ///
    /// When the segment has no ip column under `self.field`, an `EmptyScorer`
    /// is returned. Otherwise the bounds are turned into an inclusive value
    /// range (using the column's min/max for unbounded sides) and scanned
    /// through a `RangeDocSet`.
    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
        let column_opt: Option<Column<Ipv6Addr>> = reader.fast_fields().column_opt(&self.field)?;
        let ip_addr_column = match column_opt {
            Some(column) => column,
            // No such column in this segment: nothing can match.
            None => return Ok(Box::new(EmptyScorer)),
        };
        let column_min = ip_addr_column.min_value();
        let column_max = ip_addr_column.max_value();
        let value_range =
            bound_to_value_range(&self.lower_bound, &self.upper_bound, column_min, column_max);
        let range_docset = RangeDocSet::new(value_range, ip_addr_column);
        Ok(Box::new(ConstScorer::new(range_docset, boost)))
    }

    /// Explains the score of `doc`.
    ///
    /// # Errors
    ///
    /// Returns `TantivyError::InvalidArgument` when seeking the scorer to `doc`
    /// does not land on `doc`, i.e. the document does not match.
    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
        let mut scorer = self.scorer(reader, 1.0)?;
        if scorer.seek(doc) == doc {
            Ok(Explanation::new("Const", scorer.score()))
        } else {
            Err(TantivyError::InvalidArgument(format!(
                "Document #({doc}) does not match"
            )))
        }
    }
}
/// Converts a pair of `Bound`s into the inclusive ip range that has to be scanned.
///
/// * `Included(ip)` is used as is.
/// * `Excluded(ip)` is shifted by one address towards the inside of the range.
/// * `Unbounded` falls back to `min_value` (lower side) or `max_value` (upper side).
///
/// If an excluded bound cannot be shifted (a lower bound excluding the highest
/// possible address, or an upper bound excluding the lowest possible address),
/// no address can satisfy the bound. In that case an empty range
/// (`start > end`) is returned instead of overflowing, which previously
/// panicked in debug builds and wrapped around to a wrong range in release
/// builds.
fn bound_to_value_range(
    lower_bound: &Bound<Ipv6Addr>,
    upper_bound: &Bound<Ipv6Addr>,
    min_value: Ipv6Addr,
    max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
    // Canonical empty range: `start > end`, so `contains` is false for every address.
    // NOTE(review): assumes the consumer of this range treats `start > end` as
    // "matches nothing" - confirm for `RangeDocSet`.
    let empty_range = || Ipv6Addr::from(1u128)..=Ipv6Addr::from(0u128);

    let start_value = match lower_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match ip_addr.to_u128().checked_add(1) {
            Some(next) => Ipv6Addr::from(next),
            // Nothing is greater than the highest address.
            None => return empty_range(),
        },
        Bound::Unbounded => min_value,
    };
    let end_value = match upper_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match ip_addr.to_u128().checked_sub(1) {
            Some(previous) => Ipv6Addr::from(previous),
            // Nothing is smaller than the lowest address.
            None => return empty_range(),
        },
        Bound::Unbounded => max_value,
    };
    start_value..=end_value
}
// Tests for the ip fast field range weight. Most tests index a set of `Doc`s
// and compare the hit count of parsed range queries against a brute-force
// filter over the same docs.
#[cfg(test)]
pub mod tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::{Index, IndexWriter};
/// Test document: a string id and an ip address.
#[derive(Clone, Debug)]
pub struct Doc {
pub id: String,
pub ip: Ipv6Addr,
}
/// Proptest strategy producing docs from either of the two generators below.
/// The second generator starts at 1 so that its `id - 1` cannot underflow.
fn operation_strategy() -> impl Strategy<Value = Doc> {
prop_oneof![
(0u64..10_000u64).prop_map(doc_from_id_1),
(1u64..10_000u64).prop_map(doc_from_id_2),
]
}
/// Builds a doc from `id * 1000`: the id string and the ip carry the same number.
pub fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
// here the id string is the decimal form of the same number used for the ip
id: id.to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
/// Builds a doc from `id * 1000`: the ip carries that number, the id string is one less.
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id: the id string is offset by one from the ip's numeric value
id: (id - 1).to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
// Property test: 10 cases, each with between 1 and 999 generated docs.
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_ip_range_for_docs(&ops).is_ok());
}
}
/// Single doc with ip `0`.
#[test]
fn test_ip_range_regression1() {
let ops = &[doc_from_id_1(0)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
/// Mix of docs from both generators, in unsorted order.
#[test]
fn test_ip_range_regression2() {
let ops = &[
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
doc_from_id_2(91),
doc_from_id_2(33),
];
assert!(test_ip_range_for_docs(ops).is_ok());
}
/// Three consecutive docs from the first generator.
#[test]
fn test_ip_range_regression3() {
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
/// Exercises the weight directly (no query parser) on a field where every
/// document stores its ip twice. The inclusive range [2000, 3000] covers two of
/// the three documents and the expected count is 2, i.e. one per document.
#[test]
fn test_ip_range_regression3_simple() {
let mut schema_builder = Schema::builder();
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
.into_iter()
.map(Ipv6Addr::from_u128)
.collect();
for &ip_addr in &ip_addrs {
writer
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
.unwrap();
}
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let range_weight = IPFastFieldRangeWeight {
field: "ips".to_string(),
lower_bound: Bound::Included(ip_addrs[1]),
upper_bound: Bound::Included(ip_addrs[2]),
};
let count = range_weight.count(searcher.segment_reader(0)).unwrap();
assert_eq!(count, 2);
}
/// Creates an in-RAM index containing `docs`.
///
/// Schema: `ip` (STORED | FAST, one value per doc), `ips` (FAST | INDEXED, the
/// doc's ip added twice) and `id` (STRING | STORED).
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let text_field = schema_builder.add_text_field("id", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// NOTE(review): second argument is presumably the writer's memory budget - confirm.
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
text_field => doc.id.to_string(),
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
/// Indexes `docs` and checks inclusive ip range queries against a brute-force count.
///
/// For a sample of two docs, the range spans from the smaller to the larger ip.
/// The hit count is verified on both the `ip` and the `ips` field, alone and
/// intersected with an `id` filter. Expects `docs` to be non-empty, since
/// `docs[0]` is indexed unconditionally.
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
let index = create_index_from_docs(docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
// Renders `field:[start TO end]`, the inclusive range syntax.
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
};
let test_sample = |sample_docs: &[Doc]| {
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
ips.sort();
let ip_range = ips[0]..=ips[1];
// Brute-force reference: docs whose ip lies in the inclusive range.
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
.count();
let query = gen_query_inclusive("ip", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ips", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
.count();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ip", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search on multivalue ip field
let id_filter = sample_docs[0].id.to_string();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ips", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
// Single-address range first, then ranges spanning neighbouring input docs.
test_sample(&[docs[0].clone(), docs[0].clone()]);
if docs.len() > 1 {
test_sample(&[docs[0].clone(), docs[1].clone()]);
test_sample(&[docs[1].clone(), docs[1].clone()]);
}
if docs.len() > 2 {
test_sample(&[docs[1].clone(), docs[2].clone()]);
}
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};
    use test::Bencher;

    use super::tests::*;
    use super::*;
    use crate::collector::Count;
    use crate::query::QueryParser;
    use crate::Index;

    /// Builds the benchmark index: 100_000 docs generated from a fixed-seed RNG, each with an
    /// id of "veryfew", "few" or "many" and an ip whose numeric value is a multiple of 1000
    /// in `0..100_000`.
    fn get_index_0_to_100() -> Index {
        let mut rng = StdRng::from_seed([1u8; 32]);
        let num_vals = 100_000;
        let docs: Vec<_> = (0..num_vals)
            .map(|_| {
                let id = if rng.gen_bool(0.01) {
                    "veryfew" // 1%
                } else if rng.gen_bool(0.1) {
                    "few" // 9%
                } else {
                    "many" // 90%
                };
                Doc {
                    id: id.to_string(),
                    // Multiply by 1000, so that we create many buckets in the compact space.
                    // The benches depend on this range to select n-percent of elements with
                    // the methods below.
                    ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
                }
            })
            .collect();
        create_index_from_docs(&docs)
    }

    /// Inclusive ip range `0..=90_000`.
    fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
        Ipv6Addr::from_u128(0)..=Ipv6Addr::from_u128(90 * 1000)
    }

    /// Inclusive ip range `0..=10_000`.
    fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
        Ipv6Addr::from_u128(0)..=Ipv6Addr::from_u128(10 * 1000)
    }

    /// Inclusive ip range containing the single value `10_000`.
    fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
        Ipv6Addr::from_u128(10 * 1000)..=Ipv6Addr::from_u128(10 * 1000)
    }

    /// Parses `"{field}:[{start} TO {end}] {suffix}"`, runs it against `index` and returns the
    /// number of matching documents.
    fn execute_query(
        field: &str,
        ip_range: RangeInclusive<Ipv6Addr>,
        suffix: &str,
        index: &Index,
    ) -> usize {
        let query_text = format!(
            "{}:[{} TO {}] {}",
            field,
            ip_range.start(),
            ip_range.end(),
            suffix
        );
        let query = QueryParser::for_index(index, vec![])
            .parse_query(&query_text)
            .unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        searcher.search(&query, &Count).unwrap()
    }

    // Expands every `name => (field, range_fn, suffix)` entry into a `#[bench]` function that
    // builds the index once and then measures `execute_query` for that field/range/suffix.
    macro_rules! ip_range_benches {
        ($($name:ident => ($field:literal, $range:ident, $suffix:literal)),* $(,)?) => {
            $(
                #[bench]
                fn $name(bench: &mut Bencher) {
                    let index = get_index_0_to_100();
                    bench.iter(|| execute_query($field, $range(), $suffix, &index));
                }
            )*
        };
    }

    // Single-valued field `ip`.
    ip_range_benches! {
        bench_ip_range_hit_90_percent => ("ip", get_90_percent, ""),
        bench_ip_range_hit_10_percent => ("ip", get_10_percent, ""),
        bench_ip_range_hit_1_percent => ("ip", get_1_percent, ""),
        bench_ip_range_hit_10_percent_intersect_with_10_percent =>
            ("ip", get_10_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_10_percent =>
            ("ip", get_1_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_90_percent =>
            ("ip", get_1_percent, "AND id:many"),
        bench_ip_range_hit_1_percent_intersect_with_1_percent =>
            ("ip", get_1_percent, "AND id:veryfew"),
        bench_ip_range_hit_10_percent_intersect_with_90_percent =>
            ("ip", get_10_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_90_percent =>
            ("ip", get_90_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_10_percent =>
            ("ip", get_90_percent, "AND id:few"),
        bench_ip_range_hit_90_percent_intersect_with_1_percent =>
            ("ip", get_90_percent, "AND id:veryfew"),
    }

    // Multi-valued field `ips`.
    ip_range_benches! {
        bench_ip_range_hit_90_percent_multi => ("ips", get_90_percent, ""),
        bench_ip_range_hit_10_percent_multi => ("ips", get_10_percent, ""),
        bench_ip_range_hit_1_percent_multi => ("ips", get_1_percent, ""),
        bench_ip_range_hit_10_percent_intersect_with_10_percent_multi =>
            ("ips", get_10_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_10_percent_multi =>
            ("ips", get_1_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_90_percent_multi =>
            ("ips", get_1_percent, "AND id:many"),
        bench_ip_range_hit_1_percent_intersect_with_1_percent_multi =>
            ("ips", get_1_percent, "AND id:veryfew"),
        bench_ip_range_hit_10_percent_intersect_with_90_percent_multi =>
            ("ips", get_10_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_90_percent_multi =>
            ("ips", get_90_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_10_percent_multi =>
            ("ips", get_90_percent, "AND id:few"),
        bench_ip_range_hit_90_percent_intersect_with_1_percent_multi =>
            ("ips", get_90_percent, "AND id:veryfew"),
    }
}

View File

@@ -2,54 +2,34 @@
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::{ColumnType, HasAssociatedColumnType, MonotonicallyMappableToU64};
use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use common::BinarySerializable;
use super::fast_field_range_query::RangeDocSet;
use super::map_bound;
use crate::query::{ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
use super::fast_field_range_doc_set::RangeDocSet;
use super::{map_bound, map_bound_res};
use crate::query::range_query::range_query::inner_bound;
use crate::query::{AllScorer, ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, Type};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
#[derive(Clone, Debug)]
pub struct FastFieldRangeWeight {
field: String,
lower_bound: Bound<u64>,
upper_bound: Bound<u64>,
column_type_opt: Option<ColumnType>,
lower_bound: Bound<Term>,
upper_bound: Bound<Term>,
field: Field,
}
impl FastFieldRangeWeight {
/// Create a new FastFieldRangeWeight, using the u64 representation of any fast field.
pub(crate) fn new_u64_lenient(
field: String,
lower_bound: Bound<u64>,
upper_bound: Bound<u64>,
) -> Self {
let lower_bound = map_bound(&lower_bound, |val| *val);
let upper_bound = map_bound(&upper_bound, |val| *val);
/// Create a new FastFieldRangeWeight
pub(crate) fn new(field: Field, lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> Self {
Self {
field,
lower_bound,
upper_bound,
column_type_opt: None,
}
}
/// Create a new `FastFieldRangeWeight` for a range of a u64-mappable type .
pub fn new<T: HasAssociatedColumnType + MonotonicallyMappableToU64>(
field: String,
lower_bound: Bound<T>,
upper_bound: Bound<T>,
) -> Self {
let lower_bound = map_bound(&lower_bound, |val| val.to_u64());
let upper_bound = map_bound(&upper_bound, |val| val.to_u64());
Self {
field,
lower_bound,
upper_bound,
column_type_opt: Some(T::column_type()),
}
}
}
@@ -65,30 +45,86 @@ impl Query for FastFieldRangeWeight {
impl Weight for FastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let column_type_opt: Option<[ColumnType; 1]> =
self.column_type_opt.map(|column_type| [column_type]);
let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt
.as_ref()
.map(|column_types| column_types.as_slice());
let Some((column, _)) =
fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)?
else {
return Ok(Box::new(EmptyScorer));
};
#[allow(clippy::reversed_empty_ranges)]
let value_range = bound_to_value_range(
&self.lower_bound,
&self.upper_bound,
column.min_value(),
column.max_value(),
)
.unwrap_or(1..=0); // empty range
if value_range.is_empty() {
return Ok(Box::new(EmptyScorer));
// Check if both bounds are Bound::Unbounded
if self.lower_bound == Bound::Unbounded && self.upper_bound == Bound::Unbounded {
return Ok(Box::new(AllScorer::new(reader.max_doc())));
}
let field_name = reader.schema().get_field_name(self.field);
let field_type = reader.schema().get_field_entry(self.field).field_type();
if field_type.is_ip_addr() {
// Decodes the serialized term value as a big-endian u128 (exactly 16 bytes) and turns it
// into an `Ipv6Addr`. Any other length is reported as an `InvalidArgument` error.
let parse_ip_from_bytes = |term: &Term| {
    let ip_u128_bytes: [u8; 16] =
        term.serialized_value_bytes().try_into().map_err(|_| {
            crate::TantivyError::InvalidArgument(
                "Expected 16 bytes for ip address".to_string(),
            )
        })?;
    let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
    crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
};
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
reader.fast_fields().column_opt(field_name)?
else {
return Ok(Box::new(EmptyScorer));
};
let value_range = bound_to_value_range_ip(
&lower_bound,
&upper_bound,
ip_addr_column.min_value(),
ip_addr_column.max_value(),
);
let docset = RangeDocSet::new(value_range, ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
assert!(
maps_to_u64_fastfield(field_type.value_type()),
"{:?}",
field_type
);
let term = inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set");
assert_eq!(
term.typ(),
field_type.value_type(),
"Field is of type {:?}, but got term of type {:?}",
field_type,
term.typ()
);
let parse_from_bytes = |term: &Term| {
u64::from_be(
BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..])
.unwrap(),
)
};
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
let fast_field_reader = reader.fast_fields();
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(None, field_name)?
else {
return Ok(Box::new(EmptyScorer));
};
#[allow(clippy::reversed_empty_ranges)]
let value_range = bound_to_value_range(
&lower_bound,
&upper_bound,
column.min_value(),
column.max_value(),
)
.unwrap_or(1..=0); // empty range
if value_range.is_empty() {
return Ok(Box::new(EmptyScorer));
}
let docset = RangeDocSet::new(value_range, column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
let docset = RangeDocSet::new(value_range, column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -104,6 +140,35 @@ impl Weight for FastFieldRangeWeight {
}
}
/// Tells whether values of the given type map to a u64 fast field.
///
/// The match is kept exhaustive on purpose, so that adding a `Type` variant forces a decision
/// here.
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
    match typ {
        Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
        Type::IpAddr | Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
    }
}
/// Converts a pair of bounds on ip addresses into an inclusive range.
///
/// `Unbounded` sides are replaced by `min_value` / `max_value` (the extremes of the column).
/// `Excluded` sides are moved one address inwards. If that is impossible, i.e. the lower bound
/// excludes the largest address or the upper bound excludes the smallest one, no address can
/// match and an empty range (`start > end`) is returned instead of overflowing.
///
/// The returned range may also be empty when the bounds themselves describe an empty interval.
fn bound_to_value_range_ip(
    lower_bound: &Bound<Ipv6Addr>,
    upper_bound: &Bound<Ipv6Addr>,
    min_value: Ipv6Addr,
    max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
    // Built from non-literal endpoints, so this is a deliberate reversed (empty) range.
    let empty_range = || Ipv6Addr::from(1u128)..=Ipv6Addr::from(0u128);
    let start_value = match lower_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_add(1) {
            Some(next) => Ipv6Addr::from(next),
            // Nothing is greater than the largest address.
            None => return empty_range(),
        },
        Bound::Unbounded => min_value,
    };
    let end_value = match upper_bound {
        Bound::Included(ip_addr) => *ip_addr,
        Bound::Excluded(ip_addr) => match u128::from(*ip_addr).checked_sub(1) {
            Some(prev) => Ipv6Addr::from(prev),
            // Nothing is smaller than the smallest address.
            None => return empty_range(),
        },
        Bound::Unbounded => max_value,
    };
    start_value..=end_value
}
// Returns None if the range cannot be converted to an inclusive range (which is equivalent to an
// empty range).
fn bound_to_value_range<T: MonotonicallyMappableToU64>(
@@ -141,7 +206,7 @@ pub mod tests {
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::query::{QueryParser, Weight};
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
use crate::{Index, IndexWriter, TERMINATED};
use crate::{Index, IndexWriter, Term, TERMINATED};
#[derive(Clone, Debug)]
pub struct Doc {
@@ -213,10 +278,10 @@ pub mod tests {
writer.add_document(doc!(field=>52_000u64)).unwrap();
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let range_query = FastFieldRangeWeight::new_u64_lenient(
"test_field".to_string(),
Bound::Included(50_000),
Bound::Included(50_002),
let range_query = FastFieldRangeWeight::new(
field,
Bound::Included(Term::from_field_u64(field, 50_000)),
Bound::Included(Term::from_field_u64(field, 50_002)),
);
let scorer = range_query
.scorer(searcher.segment_reader(0), 1.0f32)
@@ -394,6 +459,202 @@ pub mod tests {
}
}
#[cfg(test)]
pub mod ip_range_tests {
    use proptest::prelude::ProptestConfig;
    use proptest::strategy::Strategy;
    use proptest::{prop_oneof, proptest};

    use super::*;
    use crate::collector::Count;
    use crate::query::QueryParser;
    use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
    use crate::{Index, IndexWriter};

    /// Test document: a textual id plus the ip address written to the `ip` and `ips` fields.
    #[derive(Clone, Debug)]
    pub struct Doc {
        pub id: String,
        pub ip: Ipv6Addr,
    }

    /// Generates docs through either `doc_from_id_1` or `doc_from_id_2`.
    fn operation_strategy() -> impl Strategy<Value = Doc> {
        prop_oneof![
            (0u64..10_000u64).prop_map(doc_from_id_1),
            // Starts at 1: `doc_from_id_2` subtracts one from the scaled id.
            (1u64..10_000u64).prop_map(doc_from_id_2),
        ]
    }

    /// Doc whose id text and ip value are both `id * 1000`.
    pub fn doc_from_id_1(id: u64) -> Doc {
        let scaled = id * 1000;
        Doc {
            id: scaled.to_string(),
            ip: Ipv6Addr::from_u128(scaled as u128),
        }
    }

    /// Doc whose ip value is `id * 1000` and whose id text is one less, so that ip != id.
    fn doc_from_id_2(id: u64) -> Doc {
        let scaled = id * 1000;
        Doc {
            id: (scaled - 1).to_string(),
            ip: Ipv6Addr::from_u128(scaled as u128),
        }
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10))]
        #[test]
        fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
            assert!(test_ip_range_for_docs(&ops).is_ok());
        }
    }

    #[test]
    fn test_ip_range_regression1() {
        let ops = [doc_from_id_1(0)];
        assert!(test_ip_range_for_docs(&ops).is_ok());
    }

    #[test]
    fn test_ip_range_regression2() {
        let ops = [
            doc_from_id_1(52),
            doc_from_id_1(63),
            doc_from_id_1(12),
            doc_from_id_2(91),
            doc_from_id_2(33),
        ];
        assert!(test_ip_range_for_docs(&ops).is_ok());
    }

    #[test]
    fn test_ip_range_regression3() {
        let ops = [doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
        assert!(test_ip_range_for_docs(&ops).is_ok());
    }

    #[test]
    fn test_ip_range_regression3_simple() {
        let mut schema_builder = Schema::builder();
        let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
        let index = Index::create_in_ram(schema_builder.build());
        let mut writer: IndexWriter = index.writer_for_tests().unwrap();
        let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
            .into_iter()
            .map(Ipv6Addr::from_u128)
            .collect();
        // Every document carries its address twice in the multivalued field.
        for &ip_addr in &ip_addrs {
            let document = doc!(ips_field=>ip_addr, ips_field=>ip_addr);
            writer.add_document(document).unwrap();
        }
        writer.commit().unwrap();
        let searcher = index.reader().unwrap().searcher();
        let lower = Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1]));
        let upper = Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2]));
        let range_weight = FastFieldRangeWeight::new(ips_field, lower, upper);
        let count =
            crate::query::weight::Weight::count(&range_weight, searcher.segment_reader(0)).unwrap();
        assert_eq!(count, 2);
    }

    /// Creates an in-RAM index with fields `ip` (single value), `ips` (the same address added
    /// twice) and `id`, holding one document per entry of `docs`.
    pub fn create_index_from_ip_docs(docs: &[Doc]) -> Index {
        let mut schema_builder = Schema::builder();
        let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
        let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
        let text_field = schema_builder.add_text_field("id", STRING | STORED);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
            for doc in docs {
                let document = doc!(
                    ips_field => doc.ip,
                    ips_field => doc.ip,
                    ip_field => doc.ip,
                    text_field => doc.id.clone(),
                );
                index_writer.add_document(document).unwrap();
            }
            index_writer.commit().unwrap();
        }
        index
    }

    /// Indexes `docs` and checks, for a few pairs of docs, that inclusive range queries between
    /// their addresses return the expected hit counts, alone and intersected with an id filter,
    /// on both the `ip` and the `ips` field.
    fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
        let index = create_index_from_ip_docs(docs);
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let query_parser = QueryParser::for_index(&index, vec![]);
        let num_hits = |query_text: &str| {
            let query = query_parser.parse_query(query_text).unwrap();
            searcher.search(&query, &Count).unwrap()
        };
        let inclusive_query = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
            format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
        };

        let check_sample = |first: &Doc, second: &Doc| {
            // Order the two addresses so that the range is never reversed.
            let (low, high) = if first.ip <= second.ip {
                (first.ip, second.ip)
            } else {
                (second.ip, first.ip)
            };
            let ip_range = low..=high;

            // Plain range search on the single and the multivalue field.
            let expected_num_hits = docs
                .iter()
                .filter(|doc| ip_range.contains(&doc.ip))
                .count();
            for field in ["ip", "ips"] {
                let query = inclusive_query(field, &ip_range);
                assert_eq!(num_hits(&query), expected_num_hits);
            }

            // Intersection search: range restricted to the id of the first sample doc.
            let id_filter = &first.id;
            let expected_num_hits = docs
                .iter()
                .filter(|doc| ip_range.contains(&doc.ip) && doc.id == *id_filter)
                .count();
            for field in ["ip", "ips"] {
                let query = format!(
                    "{} AND id:{}",
                    inclusive_query(field, &ip_range),
                    id_filter
                );
                assert_eq!(num_hits(&query), expected_num_hits);
            }
        };

        check_sample(&docs[0], &docs[0]);
        if docs.len() > 1 {
            check_sample(&docs[0], &docs[1]);
            check_sample(&docs[1], &docs[1]);
        }
        if docs.len() > 2 {
            check_sample(&docs[1], &docs[2]);
        }
        Ok(())
    }
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
@@ -601,3 +862,242 @@ mod bench {
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench_ip {
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};
    use test::Bencher;

    use super::ip_range_tests::*;
    use super::*;
    use crate::collector::Count;
    use crate::query::QueryParser;
    use crate::Index;

    /// Builds the benchmark index: 100_000 docs generated from a fixed-seed RNG, each with an
    /// id of "veryfew", "few" or "many" and an ip whose numeric value is a multiple of 1000
    /// in `0..100_000`.
    fn get_index_0_to_100() -> Index {
        let mut rng = StdRng::from_seed([1u8; 32]);
        let num_vals = 100_000;
        let docs: Vec<_> = (0..num_vals)
            .map(|_| {
                let id = if rng.gen_bool(0.01) {
                    "veryfew" // 1%
                } else if rng.gen_bool(0.1) {
                    "few" // 9%
                } else {
                    "many" // 90%
                };
                Doc {
                    id: id.to_string(),
                    // Multiply by 1000, so that we create many buckets in the compact space.
                    // The benches depend on this range to select n-percent of elements with
                    // the methods below.
                    ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
                }
            })
            .collect();
        create_index_from_ip_docs(&docs)
    }

    /// Inclusive ip range `0..=90_000`.
    fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
        Ipv6Addr::from_u128(0)..=Ipv6Addr::from_u128(90 * 1000)
    }

    /// Inclusive ip range `0..=10_000`.
    fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
        Ipv6Addr::from_u128(0)..=Ipv6Addr::from_u128(10 * 1000)
    }

    /// Inclusive ip range containing the single value `10_000`.
    fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
        Ipv6Addr::from_u128(10 * 1000)..=Ipv6Addr::from_u128(10 * 1000)
    }

    /// Parses `"{field}:[{start} TO {end}] {suffix}"`, runs it against `index` and returns the
    /// number of matching documents.
    fn execute_query(
        field: &str,
        ip_range: RangeInclusive<Ipv6Addr>,
        suffix: &str,
        index: &Index,
    ) -> usize {
        let query_text = format!(
            "{}:[{} TO {}] {}",
            field,
            ip_range.start(),
            ip_range.end(),
            suffix
        );
        let query = QueryParser::for_index(index, vec![])
            .parse_query(&query_text)
            .unwrap();
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        searcher.search(&query, &Count).unwrap()
    }

    // Expands every `name => (field, range_fn, suffix)` entry into a `#[bench]` function that
    // builds the index once and then measures `execute_query` for that field/range/suffix.
    macro_rules! ip_range_benches {
        ($($name:ident => ($field:literal, $range:ident, $suffix:literal)),* $(,)?) => {
            $(
                #[bench]
                fn $name(bench: &mut Bencher) {
                    let index = get_index_0_to_100();
                    bench.iter(|| execute_query($field, $range(), $suffix, &index));
                }
            )*
        };
    }

    // Single-valued field `ip`.
    ip_range_benches! {
        bench_ip_range_hit_90_percent => ("ip", get_90_percent, ""),
        bench_ip_range_hit_10_percent => ("ip", get_10_percent, ""),
        bench_ip_range_hit_1_percent => ("ip", get_1_percent, ""),
        bench_ip_range_hit_10_percent_intersect_with_10_percent =>
            ("ip", get_10_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_10_percent =>
            ("ip", get_1_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_90_percent =>
            ("ip", get_1_percent, "AND id:many"),
        bench_ip_range_hit_1_percent_intersect_with_1_percent =>
            ("ip", get_1_percent, "AND id:veryfew"),
        bench_ip_range_hit_10_percent_intersect_with_90_percent =>
            ("ip", get_10_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_90_percent =>
            ("ip", get_90_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_10_percent =>
            ("ip", get_90_percent, "AND id:few"),
        bench_ip_range_hit_90_percent_intersect_with_1_percent =>
            ("ip", get_90_percent, "AND id:veryfew"),
    }

    // Multi-valued field `ips`.
    ip_range_benches! {
        bench_ip_range_hit_90_percent_multi => ("ips", get_90_percent, ""),
        bench_ip_range_hit_10_percent_multi => ("ips", get_10_percent, ""),
        bench_ip_range_hit_1_percent_multi => ("ips", get_1_percent, ""),
        bench_ip_range_hit_10_percent_intersect_with_10_percent_multi =>
            ("ips", get_10_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_10_percent_multi =>
            ("ips", get_1_percent, "AND id:few"),
        bench_ip_range_hit_1_percent_intersect_with_90_percent_multi =>
            ("ips", get_1_percent, "AND id:many"),
        bench_ip_range_hit_1_percent_intersect_with_1_percent_multi =>
            ("ips", get_1_percent, "AND id:veryfew"),
        bench_ip_range_hit_10_percent_intersect_with_90_percent_multi =>
            ("ips", get_10_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_90_percent_multi =>
            ("ips", get_90_percent, "AND id:many"),
        bench_ip_range_hit_90_percent_intersect_with_10_percent_multi =>
            ("ips", get_90_percent, "AND id:few"),
        bench_ip_range_hit_90_percent_intersect_with_1_percent_multi =>
            ("ips", get_90_percent, "AND id:veryfew"),
    }
}