add support for str fast field range query (#2453)

* add support for str fast field range query

Add support for range queries on str fast fields by converting the term
bounds to term ordinal bounds.

closes https://github.com/quickwit-oss/tantivy/issues/2023

* extend tests, rename

* update comment

* update comment
This commit is contained in:
PSeitz
2024-07-17 10:31:42 +09:00
committed by GitHub
parent 1b4076691f
commit 7ebcc15b17
4 changed files with 397 additions and 47 deletions

View File

@@ -12,9 +12,9 @@ pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
// TODO is this correct?
/// Returns `true` when `typ` is one of the value types accepted for a fast
/// field range query, and `false` otherwise.
// NOTE(review): two alternative sets of arms appear below (one accepting
// `Type::Str`, one rejecting it); as written the first matching arm wins.
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => true,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
Type::Facet | Type::Bytes | Type::Json => false,
}
}

View File

@@ -5,7 +5,7 @@
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn};
use common::BinarySerializable;
use super::fast_field_range_doc_set::RangeDocSet;
@@ -51,16 +51,22 @@ impl Weight for FastFieldRangeWeight {
}
let field_name = reader.schema().get_field_name(self.field);
let field_type = reader.schema().get_field_entry(self.field).field_type();
let term = inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set");
assert_eq!(
term.typ(),
field_type.value_type(),
"Field is of type {:?}, but got term of type {:?}",
field_type,
term.typ()
);
if field_type.is_ip_addr() {
let parse_ip_from_bytes = |term: &Term| {
let ip_u128_bytes: [u8; 16] =
term.serialized_value_bytes().try_into().map_err(|_| {
crate::TantivyError::InvalidArgument(
"Expected 8 bytes for ip address".to_string(),
)
})?;
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
term.value().as_ip_addr().ok_or_else(|| {
crate::TantivyError::InvalidArgument("Expected ip address".to_string())
})
};
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
@@ -79,33 +85,42 @@ impl Weight for FastFieldRangeWeight {
let docset = RangeDocSet::new(value_range, ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
assert!(
maps_to_u64_fastfield(field_type.value_type()),
"{:?}",
field_type
);
let (lower_bound, upper_bound) = if field_type.is_str() {
let Some(str_dict_column): Option<StrColumn> =
reader.fast_fields().str(field_name)?
else {
return Ok(Box::new(EmptyScorer));
};
let dict = str_dict_column.dictionary();
let term = inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set");
assert_eq!(
term.typ(),
field_type.value_type(),
"Field is of type {:?}, but got term of type {:?}",
field_type,
term.typ()
);
let lower_bound = map_bound(&self.lower_bound, |term| {
term.serialized_value_bytes().to_vec()
});
let upper_bound = map_bound(&self.upper_bound, |term| {
term.serialized_value_bytes().to_vec()
});
// Get term ids for terms
let (lower_bound, upper_bound) =
dict.term_bounds_to_ord(lower_bound, upper_bound)?;
(lower_bound, upper_bound)
} else {
assert!(
maps_to_u64_fastfield(field_type.value_type()),
"{:?}",
field_type
);
let parse_from_bytes = |term: &Term| {
u64::from_be(
BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..])
.unwrap(),
)
};
let parse_from_bytes = |term: &Term| {
u64::from_be(
BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..])
.unwrap(),
)
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
(lower_bound, upper_bound)
};
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
let fast_field_reader = reader.fast_fields();
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(None, field_name)?
else {
@@ -202,12 +217,73 @@ pub mod tests {
use rand::seq::SliceRandom;
use rand::SeedableRng;
use crate::collector::Count;
use crate::collector::{Count, TopDocs};
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::query::{QueryParser, Weight};
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
use crate::schema::{
NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::{Index, IndexWriter, Term, TERMINATED};
/// Range queries over a text fast field that holds the terms "bbb" and "ddd".
///
/// Exercises inclusive, exclusive and open-ended bounds, using bound terms
/// that are indexed as well as terms that fall before, between and after the
/// indexed ones.
#[test]
fn test_text_field_ff_range_query() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | FAST);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());
    let mut index_writer = index.writer_for_tests()?;
    let title = schema.get_field("title").unwrap();
    index_writer.add_document(doc!(title => "bbb"))?;
    index_writer.add_document(doc!(title => "ddd"))?;
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title]);

    // Parses `range_query`, runs it and checks how many documents come back.
    let expect_hits = |range_query: &str, expected: usize| {
        let parsed = query_parser.parse_query(range_query).unwrap();
        let hits = searcher
            .search(&parsed, &TopDocs::with_limit(10))
            .unwrap();
        assert_eq!(hits.len(), expected);
    };

    // Both bounds set.
    expect_hits("title:[aaa TO ccc]", 1);
    expect_hits("title:[aaa TO bbb]", 1);
    expect_hits("title:[bbb TO bbb]", 1);
    expect_hits("title:[bbb TO ddd]", 2);
    expect_hits("title:[bbb TO eee]", 2);
    expect_hits("title:[bb TO eee]", 2);
    expect_hits("title:[ccc TO ccc]", 0);
    expect_hits("title:[ccc TO ddd]", 1);
    expect_hits("title:[ccc TO eee]", 1);

    // Inclusive lower bound, open upper bound.
    expect_hits("title:[aaa TO *}", 2);
    expect_hits("title:[bbb TO *]", 2);
    expect_hits("title:[bb TO *]", 2);
    expect_hits("title:[ccc TO *]", 1);
    expect_hits("title:[ddd TO *]", 1);
    expect_hits("title:[dddd TO *]", 0);

    // Exclusive lower bound, open upper bound.
    expect_hits("title:{aaa TO *}", 2);
    expect_hits("title:{bbb TO *]", 1);
    expect_hits("title:{bb TO *]", 2);
    expect_hits("title:{ccc TO *]", 1);
    expect_hits("title:{ddd TO *]", 0);
    expect_hits("title:{dddd TO *]", 0);

    // Open lower bound.
    expect_hits("title:[* TO bb]", 0);
    expect_hits("title:[* TO bbb]", 1);
    expect_hits("title:[* TO ccc]", 1);
    expect_hits("title:[* TO ddd]", 2);
    expect_hits("title:[* TO ddd}", 1);
    expect_hits("title:[* TO eee]", 2);

    Ok(())
}
#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
@@ -224,14 +300,14 @@ pub mod tests {
/// Builds a test `Doc` whose `id` is the input multiplied by 1000 and whose
/// `id_name` is derived from that scaled id.
fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
id_name: id.to_string(),
id_name: format!("id_name{:010}", id),
id,
}
}
/// Builds a test `Doc` whose `id` is the input multiplied by 1000 and whose
/// `id_name` is derived from that scaled id minus one.
// NOTE(review): `id - 1` underflows when the input is 0 -- confirm callers
// never pass 0.
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
id_name: (id - 1).to_string(),
id_name: format!("id_name{:010}", id - 1),
id,
}
}
@@ -319,7 +395,8 @@ pub mod tests {
NumericOptions::default().set_fast().set_indexed(),
);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED | FAST);
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -338,6 +415,7 @@ pub mod tests {
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
text_field2 => doc.id_name.to_string(),
))
.unwrap();
}
@@ -382,6 +460,24 @@ pub mod tests {
let query = gen_query_inclusive("ids", ids[0]..=ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Text query
{
let test_text_query = |field_name: &str| {
let mut id_names: Vec<&str> =
sample_docs.iter().map(|doc| doc.id_name.as_str()).collect();
id_names.sort();
let expected_num_hits = docs
.iter()
.filter(|doc| (id_names[0]..=id_names[1]).contains(&doc.id_name.as_str()))
.count();
let query = format!("{}:[{} TO {}]", field_name, id_names[0], id_names[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_text_query("id_name");
test_text_query("id_name_fast");
}
// Exclusive range
let expected_num_hits = docs
.iter()

View File

@@ -201,6 +201,11 @@ impl FieldType {
matches!(self, FieldType::IpAddr(_))
}
/// returns true if this is a str field
pub fn is_str(&self) -> bool {
matches!(self, FieldType::Str(_))
}
/// returns true if this is a date field
pub fn is_date(&self) -> bool {
matches!(self, FieldType::Date(_))

View File

@@ -56,6 +56,53 @@ impl Dictionary<VoidSSTable> {
}
}
/// Applies `transform` to the value held by `bound`, keeping the bound kind
/// (`Included` / `Excluded`). `Unbounded` is returned unchanged and
/// `transform` is not called for it.
fn map_bound<TFrom, TTo>(bound: &Bound<TFrom>, transform: impl Fn(&TFrom) -> TTo) -> Bound<TTo> {
    match bound {
        Bound::Included(value) => Bound::Included(transform(value)),
        Bound::Excluded(value) => Bound::Excluded(transform(value)),
        Bound::Unbounded => Bound::Unbounded,
    }
}
/// Replaces a bound by whatever bound `transform` builds from its inner value.
///
/// Unlike a plain mapping, the closure returns a full `Bound`, so the bound
/// kind of the result may differ from the kind of the input. `Unbounded` is
/// passed through without calling `transform`, and an error returned by the
/// closure is returned to the caller.
fn transform_bound_inner<TFrom, TTo>(
    bound: &Bound<TFrom>,
    transform: impl Fn(&TFrom) -> io::Result<Bound<TTo>>,
) -> io::Result<Bound<TTo>> {
    match bound {
        Bound::Included(inner) | Bound::Excluded(inner) => transform(inner),
        Bound::Unbounded => Ok(Bound::Unbounded),
    }
}
/// Outcome of looking up a term in the dictionary: either the ordinal of the
/// term itself, or the ordinal of the next term when there is no exact match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TermOrdHit {
/// Exact term ord hit
Exact(TermOrdinal),
/// Next best term ordinal
Next(TermOrdinal),
}
impl TermOrdHit {
fn into_exact(self) -> Option<TermOrdinal> {
match self {
TermOrdHit::Exact(ord) => Some(ord),
TermOrdHit::Next(_) => None,
}
}
fn map<F: FnOnce(TermOrdinal) -> TermOrdinal>(self, f: F) -> Self {
match self {
TermOrdHit::Exact(ord) => TermOrdHit::Exact(f(ord)),
TermOrdHit::Next(ord) => TermOrdHit::Next(f(ord)),
}
}
}
impl<TSSTable: SSTable> Dictionary<TSSTable> {
pub fn builder<W: io::Write>(wrt: W) -> io::Result<crate::Writer<W, TSSTable::ValueWriter>> {
Ok(TSSTable::writer(wrt))
@@ -257,6 +304,17 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
key: K,
sstable_delta_reader: &mut DeltaReader<TSSTable::ValueReader>,
) -> io::Result<Option<TermOrdinal>> {
self.decode_up_to_or_next(key, sstable_delta_reader)
.map(|hit| hit.into_exact())
}
/// Decode a DeltaReader up to key, returning the number of terms traversed
///
/// If the key was not found, it returns the next term id.
fn decode_up_to_or_next<K: AsRef<[u8]>>(
&self,
key: K,
sstable_delta_reader: &mut DeltaReader<TSSTable::ValueReader>,
) -> io::Result<TermOrdHit> {
let mut term_ord = 0;
let key_bytes = key.as_ref();
let mut ok_bytes = 0;
@@ -265,7 +323,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
let suffix = sstable_delta_reader.suffix();
match prefix_len.cmp(&ok_bytes) {
Ordering::Less => return Ok(None), // popped bytes already matched => too far
Ordering::Less => return Ok(TermOrdHit::Next(term_ord)), /* popped bytes already matched => too far */
Ordering::Equal => (),
Ordering::Greater => {
// the ok prefix is less than current entry prefix => continue to next elem
@@ -277,25 +335,26 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
// we have ok_bytes byte of common prefix, check if this key adds more
for (key_byte, suffix_byte) in key_bytes[ok_bytes..].iter().zip(suffix) {
match suffix_byte.cmp(key_byte) {
Ordering::Less => break, // byte too small
Ordering::Equal => ok_bytes += 1, // new matching byte
Ordering::Greater => return Ok(None), // too far
Ordering::Less => break, // byte too small
Ordering::Equal => ok_bytes += 1, // new matching
// byte
Ordering::Greater => return Ok(TermOrdHit::Next(term_ord)), // too far
}
}
if ok_bytes == key_bytes.len() {
if prefix_len + suffix.len() == ok_bytes {
return Ok(Some(term_ord));
return Ok(TermOrdHit::Exact(term_ord));
} else {
// current key is a prefix of current element, not a match
return Ok(None);
return Ok(TermOrdHit::Next(term_ord));
}
}
term_ord += 1;
}
Ok(None)
Ok(TermOrdHit::Next(term_ord))
}
/// Returns the ordinal associated with a given term.
@@ -312,6 +371,61 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
.map(|opt| opt.map(|ord| ord + first_ordinal))
}
/// Looks up `key` and returns its term ordinal, or the ordinal of the closest
/// following term when `key` itself is not in the dictionary.
///
/// The ordinal carried by `TermOrdHit::Next` may not refer to an existing
/// term. In particular, `TermOrdHit::Next(u64::MAX)` is returned when the
/// index finds no block for `key`.
pub fn term_ord_or_next<K: AsRef<[u8]>>(&self, key: K) -> io::Result<TermOrdHit> {
    let key_bytes = key.as_ref();
    match self.sstable_index.get_block_with_key(key_bytes) {
        // TODO: Would be more consistent to return last_term id + 1
        None => Ok(TermOrdHit::Next(u64::MAX)),
        Some(block_addr) => {
            let first_ordinal = block_addr.first_ordinal;
            let mut delta_reader = self.sstable_delta_reader_block(block_addr)?;
            // The decoder counts from the start of the block, so shift the
            // result by the ordinal of the block's first term.
            let hit_in_block = self.decode_up_to_or_next(key_bytes, &mut delta_reader)?;
            Ok(hit_in_block.map(|ord| ord + first_ordinal))
        }
    }
}
/// Converts a pair of term bounds into the matching pair of term ordinal bounds.
///
/// A bound term that is found in the dictionary is replaced by its ordinal and
/// keeps its bound kind. A bound term that is not found is replaced by the
/// ordinal of the next term, and the bound kind is forced so that the range
/// stays equivalent: a lower bound becomes `Included`, an upper bound becomes
/// `Excluded`. `Unbounded` stays `Unbounded`.
///
/// Example for a dictionary holding `[bbb, ddd]`:
///
/// ```text
/// lower_bound: Included(aaa) => Included(0)  // next term id
/// lower_bound: Excluded(aaa) => Included(0)  // next term id, bound changed
/// lower_bound: Included(ccc) => Included(1)  // next term id
/// lower_bound: Excluded(ccc) => Included(1)  // next term id, bound changed
/// lower_bound: Included(zzz) => Included(2)  // next term id
/// lower_bound: Excluded(zzz) => Included(2)  // next term id, bound changed
///
/// upper_bound: Included(aaa) => Excluded(0)  // next term id, bound changed
/// upper_bound: Excluded(aaa) => Excluded(0)  // next term id
/// upper_bound: Included(ccc) => Excluded(1)  // next term id, bound changed
/// upper_bound: Excluded(ccc) => Excluded(1)  // next term id
/// upper_bound: Included(zzz) => Excluded(2)  // next term id, bound changed
/// upper_bound: Excluded(zzz) => Excluded(2)  // next term id
/// ```
///
/// For a lower bound such as `zzz` we should have some post processing to
/// return an empty query.
pub fn term_bounds_to_ord<K: AsRef<[u8]>>(
    &self,
    lower_bound: Bound<K>,
    upper_bound: Bound<K>,
) -> io::Result<(Bound<TermOrdinal>, Bound<TermOrdinal>)> {
    let lower_ord_bound = transform_bound_inner(&lower_bound, |lower_key| {
        Ok(match self.term_ord_or_next(lower_key)? {
            // The term exists: keep the caller's bound kind.
            TermOrdHit::Exact(ord) => map_bound(&lower_bound, |_| ord),
            // The range starts at the next term, which has to be included.
            TermOrdHit::Next(ord) => Bound::Included(ord),
        })
    })?;
    let upper_ord_bound = transform_bound_inner(&upper_bound, |upper_key| {
        Ok(match self.term_ord_or_next(upper_key)? {
            // The term exists: keep the caller's bound kind.
            TermOrdHit::Exact(ord) => map_bound(&upper_bound, |_| ord),
            // The range ends before the next term, which has to be excluded.
            TermOrdHit::Next(ord) => Bound::Excluded(ord),
        })
    })?;
    Ok((lower_ord_bound, upper_ord_bound))
}
/// Returns the term associated with a given term ordinal.
///
/// Term ordinals are defined as the position of the term in
@@ -455,12 +569,13 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
#[cfg(test)]
mod tests {
use std::ops::Range;
use std::ops::{Bound, Range};
use std::sync::{Arc, Mutex};
use common::OwnedBytes;
use super::Dictionary;
use crate::dictionary::TermOrdHit;
use crate::MonotonicU64SSTable;
#[derive(Debug)]
@@ -524,6 +639,140 @@ mod tests {
(dictionary, table)
}
/// `term_ord_or_next` on a small dictionary that holds "bbb" and "ddd".
#[test]
fn test_term_to_ord_or_next() {
    let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
    builder.insert(b"bbb", &1).unwrap();
    builder.insert(b"ddd", &2).unwrap();
    let handle = Arc::new(PermissionedHandle::new(builder.finish().unwrap()));
    let slice = common::file_slice::FileSlice::new(handle);
    let dict = Dictionary::<MonotonicU64SSTable>::open(slice).unwrap();

    let lookup = |key: &[u8]| dict.term_ord_or_next(key).unwrap();

    assert_eq!(lookup(b"aaa"), TermOrdHit::Next(0));
    assert_eq!(lookup(b"bbb"), TermOrdHit::Exact(0));
    assert_eq!(lookup(b"bb"), TermOrdHit::Next(0));
    assert_eq!(lookup(b"bbbb"), TermOrdHit::Next(1));
    assert_eq!(lookup(b"dd"), TermOrdHit::Next(1));
    assert_eq!(lookup(b"ddd"), TermOrdHit::Exact(1));
    assert_eq!(lookup(b"dddd"), TermOrdHit::Next(2));

    // This is not u64::MAX: a very small sstable (a single block) stores no
    // index, and the pseudo-index always replies that the answer lies in
    // block number 0.
    assert_eq!(lookup(b"zzzzzzz"), TermOrdHit::Next(2));
}
/// `term_ord_or_next` on a dictionary holding "bbb", 50,000 filler terms and
/// "eee".
#[test]
fn test_term_to_ord_or_next_2() {
    let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
    builder.insert(b"bbb", &0u64).unwrap();
    // Fill blocks in between
    for elem in 0..50_000u64 {
        let key = format!("ccccc{elem:05X}");
        builder.insert(key.as_bytes(), &(elem + 1)).unwrap();
    }
    builder.insert(b"eee", &50_001u64).unwrap();
    let handle = Arc::new(PermissionedHandle::new(builder.finish().unwrap()));
    let slice = common::file_slice::FileSlice::new(handle);
    let dict = Dictionary::<MonotonicU64SSTable>::open(slice).unwrap();

    let lookup = |key: &[u8]| dict.term_ord_or_next(key).unwrap();

    assert_eq!(dict.term_ord(b"bbb").unwrap(), Some(0));
    assert_eq!(lookup(b"bbb"), TermOrdHit::Exact(0));
    assert_eq!(lookup(b"aaa"), TermOrdHit::Next(0));
    assert_eq!(lookup(b"bb"), TermOrdHit::Next(0));
    assert_eq!(lookup(b"bbbb"), TermOrdHit::Next(1));
    assert_eq!(lookup(b"ee"), TermOrdHit::Next(50001));
    assert_eq!(lookup(b"eee"), TermOrdHit::Exact(50001));
    assert_eq!(lookup(b"eeee"), TermOrdHit::Next(u64::MAX));
    assert_eq!(lookup(b"zzzzzzz"), TermOrdHit::Next(u64::MAX));
}
/// `term_bounds_to_ord` on a dictionary that holds "bbb" (ordinal 0) and
/// "ddd" (ordinal 1), checking each side of the range on its own.
#[test]
fn test_term_bounds_to_ord() {
    let mut builder = Dictionary::<MonotonicU64SSTable>::builder(Vec::new()).unwrap();
    builder.insert(b"bbb", &1).unwrap();
    builder.insert(b"ddd", &2).unwrap();
    let handle = Arc::new(PermissionedHandle::new(builder.finish().unwrap()));
    let slice = common::file_slice::FileSlice::new(handle);
    let dict = Dictionary::<MonotonicU64SSTable>::open(slice).unwrap();

    // Converts `bound` as a lower bound; the upper bound is a placeholder.
    let lower_of = |bound: Bound<&[u8]>| {
        let (lower, _upper) = dict
            .term_bounds_to_ord::<&[u8]>(bound, Bound::Included(b"ignored"))
            .unwrap();
        lower
    };
    assert_eq!(lower_of(Bound::Included(b"aaa".as_slice())), Bound::Included(0));
    assert_eq!(lower_of(Bound::Excluded(b"aaa".as_slice())), Bound::Included(0));
    assert_eq!(lower_of(Bound::Included(b"bbb".as_slice())), Bound::Included(0));
    assert_eq!(lower_of(Bound::Excluded(b"bbb".as_slice())), Bound::Excluded(0));
    assert_eq!(lower_of(Bound::Included(b"ccc".as_slice())), Bound::Included(1));
    assert_eq!(lower_of(Bound::Excluded(b"ccc".as_slice())), Bound::Included(1));
    assert_eq!(lower_of(Bound::Included(b"zzz".as_slice())), Bound::Included(2));
    assert_eq!(lower_of(Bound::Excluded(b"zzz".as_slice())), Bound::Included(2));

    // Converts `bound` as an upper bound; the lower bound is a placeholder.
    let upper_of = |bound: Bound<&[u8]>| {
        let (_lower, upper) = dict
            .term_bounds_to_ord::<&[u8]>(Bound::Included(b"ignored"), bound)
            .unwrap();
        upper
    };
    assert_eq!(upper_of(Bound::Included(b"ccc".as_slice())), Bound::Excluded(1));
    assert_eq!(upper_of(Bound::Excluded(b"ccc".as_slice())), Bound::Excluded(1));
    assert_eq!(upper_of(Bound::Included(b"zzz".as_slice())), Bound::Excluded(2));
    assert_eq!(upper_of(Bound::Excluded(b"zzz".as_slice())), Bound::Excluded(2));
    assert_eq!(upper_of(Bound::Included(b"ddd".as_slice())), Bound::Included(1));
    assert_eq!(upper_of(Bound::Excluded(b"ddd".as_slice())), Bound::Excluded(1));
}
#[test]
fn test_ord_term_conversion() {
let (dic, slice) = make_test_sstable();