Files
tantivy/src/query/exist_query.rs
Paul Masurel f88b7200b2 Optimization when posting list are saturated. (#2745)
* Optimization when posting list are saturated.

If a posting list doc freq is the segment reader's
max_doc, and if scoring does not matter, we can replace it
by a AllScorer.

In turn, in a boolean query, we can dismiss  all scorers and
empty scorers, to accelerate the request.

* Added range query optimization

* CR comment

* CR comments

* CR comment

---------

Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
2025-11-26 15:50:57 +01:00

471 lines
17 KiB
Rust

use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use common::BitSet;
use super::{ConstScorer, EmptyScorer};
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::query::all_query::AllScorer;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::Type;
use crate::{DocId, Score, TantivyError};
/// Query that matches all documents with a non-null value in the specified
/// field.
///
/// When querying inside a JSON field, "exists" queries can be executed strictly
/// on the field name or check all the subpaths. In that second case a document
/// will be matched if a non-null value exists in any subpath. For example,
/// assuming the following document where `myfield` is a JSON fast field:
/// ```json
/// {
/// "myfield": {
/// "mysubfield": "hello"
/// }
/// }
/// ```
/// With `json_subpaths` enabled queries on either `myfield` or
/// `myfield.mysubfield` will match the document. If it is set to false, only
/// `myfield.mysubfield` will match it.
///
/// All of the matched documents get the score 1.0.
#[derive(Clone, Debug)]
pub struct ExistsQuery {
field_name: String,
json_subpaths: bool,
}
impl ExistsQuery {
/// Creates a new `ExistQuery` from the given field.
///
/// This query matches all documents with at least one non-null value in the specified field.
/// This constructor never fails, but executing the search with this query will return an
/// error if the specified field doesn't exists or is not a fast field.
#[deprecated]
pub fn new_exists_query(field: String) -> ExistsQuery {
ExistsQuery {
field_name: field,
json_subpaths: false,
}
}
/// Creates a new `ExistQuery` from the given field.
///
/// This query matches all documents with at least one non-null value in the
/// specified field. If `json_subpaths` is set to true, documents with
/// non-null values in any JSON subpath will also be matched.
///
/// This constructor never fails, but executing the search with this query will
/// return an error if the specified field doesn't exists or is not a fast
/// field.
pub fn new(field: String, json_subpaths: bool) -> Self {
Self {
field_name: field,
json_subpaths,
}
}
}
impl Query for ExistsQuery {
fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let Some((field, _path)) = schema.find_field(&self.field_name) else {
return Err(TantivyError::FieldNotFound(self.field_name.clone()));
};
let field_type = schema.get_field_entry(field).field_type();
if !field_type.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {} is not a fast field.",
self.field_name
)));
}
Ok(Box::new(ExistsWeight {
field_name: self.field_name.clone(),
field_type: field_type.value_type(),
json_subpaths: self.json_subpaths,
}))
}
}
/// Weight associated with the `ExistsQuery` query.
pub struct ExistsWeight {
field_name: String,
field_type: Type,
json_subpaths: bool,
}
impl Weight for ExistsWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?;
if self.field_type == Type::Json && self.json_subpaths {
let mut sub_columns =
fast_field_reader.dynamic_subpath_column_handles(&self.field_name)?;
column_handles.append(&mut sub_columns);
}
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = column_handles
.into_iter()
.map(|handle| handle.open().map_err(|io_error| io_error.into()))
.collect();
let mut non_empty_columns = Vec::new();
for column in dynamic_columns? {
if !matches!(column.column_index(), ColumnIndex::Empty { .. }) {
non_empty_columns.push(column)
}
}
if non_empty_columns.is_empty() {
return Ok(Box::new(EmptyScorer));
}
// If any column is full, all docs match.
let max_doc = reader.max_doc();
if non_empty_columns
.iter()
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
{
let all_scorer = AllScorer::new(max_doc);
if boost != 1.0f32 {
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
} else {
return Ok(Box::new(all_scorer));
}
}
// If we have a single dynamic column, use ExistsDocSet
// NOTE: A lower number may be better for very sparse columns
if non_empty_columns.len() < 4 {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
return Ok(Box::new(ConstScorer::new(docset, boost)));
}
// If we have many dynamic columns, precompute a bitset of matching docs
let mut doc_bitset = BitSet::with_max_value(max_doc);
for column in &non_empty_columns {
match column.column_index() {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => {
// Handled by AllScorer return above.
}
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
ColumnIndex::Multivalued(multi_idx) => {
for doc in multi_idx.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
}
}
let docset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
Ok(Explanation::new("ExistsQuery", 1.0))
}
}
pub(crate) struct ExistsDocSet {
columns: Vec<DynamicColumn>,
doc: DocId,
max_doc: DocId,
}
impl ExistsDocSet {
pub(crate) fn new(columns: Vec<DynamicColumn>, max_doc: DocId) -> Self {
let mut set = Self {
columns,
doc: 0u32,
max_doc,
};
set.find_next();
set
}
fn find_next(&mut self) -> DocId {
while self.doc < self.max_doc {
if self
.columns
.iter()
.any(|col| col.column_index().has_value(self.doc))
{
return self.doc;
}
self.doc += 1;
}
self.doc = TERMINATED;
TERMINATED
}
}
impl DocSet for ExistsDocSet {
fn advance(&mut self) -> DocId {
self.seek(self.doc + 1)
}
fn size_hint(&self) -> u32 {
0
}
fn doc(&self) -> DocId {
self.doc
}
#[inline(always)]
fn seek(&mut self, target: DocId) -> DocId {
self.doc = target;
self.find_next()
}
}
#[cfg(test)]
mod tests {
use std::net::Ipv6Addr;
use std::ops::Bound;
use common::DateTime;
use time::OffsetDateTime;
use crate::collector::Count;
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{Index, Searcher, Term};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let all_field = schema_builder.add_u64_field("all", INDEXED | FAST);
let even_field = schema_builder.add_u64_field("even", INDEXED | FAST);
let odd_field = schema_builder.add_text_field("odd", STRING | FAST);
let multi_field = schema_builder.add_text_field("multi", FAST);
let _never_field = schema_builder.add_u64_field("never", INDEXED | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
if i % 10 == 0 {
index_writer.add_document(doc!(all_field => i, even_field => i, multi_field => i.to_string(), multi_field => (i + 1).to_string()))?;
} else {
index_writer.add_document(doc!(all_field => i, even_field => i))?;
}
} else {
index_writer.add_document(doc!(all_field => i, odd_field => i.to_string()))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "all", false)?, 100);
assert_eq!(count_existing_fields(&searcher, "odd", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "even", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "multi", false)?, 10);
assert_eq!(count_existing_fields(&searcher, "multi", true)?, 10);
assert_eq!(count_existing_fields(&searcher, "never", false)?, 0);
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 50)),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new("even".to_string(), false)),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 0)),
Bound::Included(Term::from_field_u64(all_field, 50)),
)),
Box::new(ExistsQuery::new("odd".to_string(), false)),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
Ok(())
}
#[test]
fn test_exists_query_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
index_writer.add_document(doc!(json => json!({"all": i, "even": true})))?;
} else {
index_writer
.add_document(doc!(json => json!({"all": i.to_string(), "odd": true})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "json.all", false)?, 100);
assert_eq!(count_existing_fields(&searcher, "json.even", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.even", true)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.odd", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
// Handling of non-existing fields:
assert_eq!(count_existing_fields(&searcher, "json.absent", false)?, 0);
assert_eq!(count_existing_fields(&searcher, "json.absent", true)?, 0);
assert_does_not_exist(&searcher, "does_not_exists.absent", true);
assert_does_not_exist(&searcher, "does_not_exists.absent", false);
Ok(())
}
#[test]
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
// Build docs where no single subpath exists for all docs, but the union does.
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
// only subpath `a`
index_writer.add_document(doc!(json => json!({"a": i})))?;
} else {
// only subpath `b`
index_writer.add_document(doc!(json => json!({"b": i})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// No single subpath is full
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
// Root exists with subpaths disabled is zero
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
// Root exists with subpaths enabled should match all docs via union
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let bool = schema_builder.add_bool_field("bool", FAST);
let bytes = schema_builder.add_bytes_field("bytes", FAST);
let date = schema_builder.add_date_field("date", FAST);
let f64 = schema_builder.add_f64_field("f64", FAST);
let ip_addr = schema_builder.add_ip_addr_field("ip_addr", FAST);
let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
let now = OffsetDateTime::now_utc().unix_timestamp();
for i in 0u8..100u8 {
if i % 2 == 0 {
let date_val = DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
now + i as i64 * 100,
)?);
index_writer.add_document(
doc!(bool => i % 3 == 0, bytes => vec![i, i + 1, i + 2], date => date_val),
)?;
} else {
let ip_addr_v6 = Ipv6Addr::new(0, 0, 0, 0, 0, 0xffff, 0xc00a, i.into());
index_writer
.add_document(doc!(f64 => i as f64 * 0.5, ip_addr => ip_addr_v6, facet => Facet::from("/facet/foo"), facet => Facet::from("/facet/bar")))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "bool", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "bool", true)?, 50);
assert_eq!(count_existing_fields(&searcher, "bytes", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "date", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "f64", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "ip_addr", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "facet", false)?, 50);
Ok(())
}
#[test]
fn test_exists_query_unsupported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let not_fast = schema_builder.add_text_field("not_fast", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
not_fast => "slow",
))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(
searcher
.search(&ExistsQuery::new("not_fast".to_string(), false), &Count)
.unwrap_err()
.to_string(),
"Schema error: 'Field not_fast is not a fast field.'"
);
assert_does_not_exist(&searcher, "does_not_exists", false);
Ok(())
}
fn count_existing_fields(
searcher: &Searcher,
field: &str,
json_subpaths: bool,
) -> crate::Result<usize> {
let query = ExistsQuery::new(field.to_string(), json_subpaths);
searcher.search(&query, &Count)
}
fn assert_does_not_exist(searcher: &Searcher, field: &str, json_subpaths: bool) {
assert_eq!(
searcher
.search(&ExistsQuery::new(field.to_string(), json_subpaths), &Count)
.unwrap_err()
.to_string(),
format!("The field does not exist: '{field}'")
);
}
}