mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 16:40:43 +00:00
Extract trait interfaces from tantivy's core reader types so that alternative storage backends (e.g. Quickwit) can provide their own implementations while tantivy's query engine works through dynamic dispatch. Reader trait extraction: - SegmentReader is now a trait; the concrete implementation is renamed to TantivySegmentReader. - DynInvertedIndexReader trait for object-safe dynamic dispatch, plus a typed InvertedIndexReader trait with associated Postings/DocSet types for static dispatch. The concrete reader becomes TantivyInvertedIndexReader. - StoreReader is now a trait; the concrete implementation is renamed to TantivyStoreReader. get() returns TantivyDocument directly instead of requiring a generic DocumentDeserialize bound. Typed downcast for performance-critical paths: - try_downcast_and_call() + TypedInvertedIndexReaderCb allow query weights (TermWeight, PhraseWeight) to attempt a downcast to the concrete TantivyInvertedIndexReader, obtaining typed postings for zero-cost scoring, and falling back to the dynamic path otherwise. - TermScorer<TPostings> is now generic over its postings type. - PostingsWithBlockMax trait enables block-max WAND acceleration through the trait boundary. - block_wand() and block_wand_single_scorer() are generic over PostingsWithBlockMax, and for_each_pruning is dispatched through the SegmentReader trait so custom backends can provide their own block-max implementations. Searcher decoupled from Index: - New SearcherContext holds schema, executor, and tokenizers. - Searcher can be constructed from Vec<Arc<dyn SegmentReader>> via Searcher::from_segment_readers(), without needing an Index. - Searcher::index() is deprecated in favor of Searcher::context(). Postings and DocSet changes: - Postings trait gains doc_freq() -> DocFreq (Exact/Approximate) and has_freq(). - RawPostingsData struct carries raw postings bytes across the trait boundary for custom reader implementations. - BlockSegmentPostings::open() takes OwnedBytes instead of FileSlice. - DocSet gains fill_bitset() method. Scorer improvements: - Scorer trait absorbs for_each, for_each_pruning, and explain (previously free functions or on Weight). - box_scorer() helper avoids double-boxing Box<dyn Scorer>. - BoxedTermScorer wraps a type-erased term scorer. - BufferedUnionScorer initialization fixed to avoid an extra advance() on construction. Other changes: - Document::to_json() now returns serde_json::Value; the old string serialization is renamed to to_serialized_json(). - DocumentDeserialize removed from the store reader public API.
471 lines
17 KiB
Rust
471 lines
17 KiB
Rust
use core::fmt::Debug;
|
|
|
|
use columnar::{ColumnIndex, DynamicColumn};
|
|
use common::BitSet;
|
|
|
|
use super::{box_scorer, ConstScorer, EmptyScorer};
|
|
use crate::docset::{DocSet, TERMINATED};
|
|
use crate::index::SegmentReader;
|
|
use crate::query::all_query::AllScorer;
|
|
use crate::query::boost_query::BoostScorer;
|
|
use crate::query::explanation::does_not_match;
|
|
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
|
|
use crate::schema::Type;
|
|
use crate::{DocId, Score, TantivyError};
|
|
|
|
/// Query that matches all documents with a non-null value in the specified
|
|
/// field.
|
|
///
|
|
/// When querying inside a JSON field, "exists" queries can be executed strictly
|
|
/// on the field name or check all the subpaths. In that second case a document
|
|
/// will be matched if a non-null value exists in any subpath. For example,
|
|
/// assuming the following document where `myfield` is a JSON fast field:
|
|
/// ```json
|
|
/// {
|
|
/// "myfield": {
|
|
/// "mysubfield": "hello"
|
|
/// }
|
|
/// }
|
|
/// ```
|
|
/// With `json_subpaths` enabled queries on either `myfield` or
|
|
/// `myfield.mysubfield` will match the document. If it is set to false, only
|
|
/// `myfield.mysubfield` will match it.
|
|
///
|
|
/// All of the matched documents get the score 1.0.
|
|
#[derive(Clone, Debug)]
|
|
pub struct ExistsQuery {
|
|
field_name: String,
|
|
json_subpaths: bool,
|
|
}
|
|
|
|
impl ExistsQuery {
|
|
/// Creates a new `ExistQuery` from the given field.
|
|
///
|
|
/// This query matches all documents with at least one non-null value in the specified field.
|
|
/// This constructor never fails, but executing the search with this query will return an
|
|
/// error if the specified field doesn't exists or is not a fast field.
|
|
#[deprecated]
|
|
pub fn new_exists_query(field: String) -> ExistsQuery {
|
|
ExistsQuery {
|
|
field_name: field,
|
|
json_subpaths: false,
|
|
}
|
|
}
|
|
|
|
/// Creates a new `ExistQuery` from the given field.
|
|
///
|
|
/// This query matches all documents with at least one non-null value in the
|
|
/// specified field. If `json_subpaths` is set to true, documents with
|
|
/// non-null values in any JSON subpath will also be matched.
|
|
///
|
|
/// This constructor never fails, but executing the search with this query will
|
|
/// return an error if the specified field doesn't exists or is not a fast
|
|
/// field.
|
|
pub fn new(field: String, json_subpaths: bool) -> Self {
|
|
Self {
|
|
field_name: field,
|
|
json_subpaths,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Query for ExistsQuery {
|
|
fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
|
|
let schema = enable_scoring.schema();
|
|
let Some((field, _path)) = schema.find_field(&self.field_name) else {
|
|
return Err(TantivyError::FieldNotFound(self.field_name.clone()));
|
|
};
|
|
let field_type = schema.get_field_entry(field).field_type();
|
|
if !field_type.is_fast() {
|
|
return Err(TantivyError::SchemaError(format!(
|
|
"Field {} is not a fast field.",
|
|
self.field_name
|
|
)));
|
|
}
|
|
Ok(Box::new(ExistsWeight {
|
|
field_name: self.field_name.clone(),
|
|
field_type: field_type.value_type(),
|
|
json_subpaths: self.json_subpaths,
|
|
}))
|
|
}
|
|
}
|
|
|
|
/// Weight associated with the `ExistsQuery` query.
|
|
pub struct ExistsWeight {
|
|
field_name: String,
|
|
field_type: Type,
|
|
json_subpaths: bool,
|
|
}
|
|
|
|
impl Weight for ExistsWeight {
|
|
fn scorer(&self, reader: &dyn SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
|
let fast_field_reader = reader.fast_fields();
|
|
let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?;
|
|
if self.field_type == Type::Json && self.json_subpaths {
|
|
let mut sub_columns =
|
|
fast_field_reader.dynamic_subpath_column_handles(&self.field_name)?;
|
|
column_handles.append(&mut sub_columns);
|
|
}
|
|
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = column_handles
|
|
.into_iter()
|
|
.map(|handle| handle.open().map_err(|io_error| io_error.into()))
|
|
.collect();
|
|
let mut non_empty_columns = Vec::new();
|
|
for column in dynamic_columns? {
|
|
if !matches!(column.column_index(), ColumnIndex::Empty { .. }) {
|
|
non_empty_columns.push(column)
|
|
}
|
|
}
|
|
if non_empty_columns.is_empty() {
|
|
return Ok(box_scorer(EmptyScorer));
|
|
}
|
|
|
|
// If any column is full, all docs match.
|
|
let max_doc = reader.max_doc();
|
|
if non_empty_columns
|
|
.iter()
|
|
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
|
|
{
|
|
let all_scorer = AllScorer::new(max_doc);
|
|
if boost != 1.0f32 {
|
|
return Ok(box_scorer(BoostScorer::new(all_scorer, boost)));
|
|
} else {
|
|
return Ok(box_scorer(all_scorer));
|
|
}
|
|
}
|
|
|
|
// If we have a single dynamic column, use ExistsDocSet
|
|
// NOTE: A lower number may be better for very sparse columns
|
|
if non_empty_columns.len() < 4 {
|
|
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
|
|
return Ok(box_scorer(ConstScorer::new(docset, boost)));
|
|
}
|
|
|
|
// If we have many dynamic columns, precompute a bitset of matching docs
|
|
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
|
for column in &non_empty_columns {
|
|
match column.column_index() {
|
|
ColumnIndex::Empty { .. } => {}
|
|
ColumnIndex::Full => {
|
|
// Handled by AllScorer return above.
|
|
}
|
|
ColumnIndex::Optional(optional_index) => {
|
|
for doc in optional_index.iter_non_null_docs() {
|
|
doc_bitset.insert(doc);
|
|
}
|
|
}
|
|
ColumnIndex::Multivalued(multi_idx) => {
|
|
for doc in multi_idx.iter_non_null_docs() {
|
|
doc_bitset.insert(doc);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
let docset = BitSetDocSet::from(doc_bitset);
|
|
Ok(box_scorer(ConstScorer::new(docset, boost)))
|
|
}
|
|
|
|
fn explain(&self, reader: &dyn SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
|
let mut scorer = self.scorer(reader, 1.0)?;
|
|
if scorer.seek(doc) != doc {
|
|
return Err(does_not_match(doc));
|
|
}
|
|
Ok(Explanation::new("ExistsQuery", 1.0))
|
|
}
|
|
}
|
|
|
|
pub(crate) struct ExistsDocSet {
|
|
columns: Vec<DynamicColumn>,
|
|
doc: DocId,
|
|
max_doc: DocId,
|
|
}
|
|
|
|
impl ExistsDocSet {
|
|
pub(crate) fn new(columns: Vec<DynamicColumn>, max_doc: DocId) -> Self {
|
|
let mut set = Self {
|
|
columns,
|
|
doc: 0u32,
|
|
max_doc,
|
|
};
|
|
set.find_next();
|
|
set
|
|
}
|
|
|
|
fn find_next(&mut self) -> DocId {
|
|
while self.doc < self.max_doc {
|
|
if self
|
|
.columns
|
|
.iter()
|
|
.any(|col| col.column_index().has_value(self.doc))
|
|
{
|
|
return self.doc;
|
|
}
|
|
self.doc += 1;
|
|
}
|
|
self.doc = TERMINATED;
|
|
TERMINATED
|
|
}
|
|
}
|
|
|
|
impl DocSet for ExistsDocSet {
|
|
fn advance(&mut self) -> DocId {
|
|
self.seek(self.doc + 1)
|
|
}
|
|
|
|
fn size_hint(&self) -> u32 {
|
|
0
|
|
}
|
|
|
|
fn doc(&self) -> DocId {
|
|
self.doc
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn seek(&mut self, target: DocId) -> DocId {
|
|
self.doc = target;
|
|
self.find_next()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use std::net::Ipv6Addr;
|
|
use std::ops::Bound;
|
|
|
|
use common::DateTime;
|
|
use time::OffsetDateTime;
|
|
|
|
use crate::collector::Count;
|
|
use crate::query::exist_query::ExistsQuery;
|
|
use crate::query::{BooleanQuery, RangeQuery};
|
|
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
|
|
use crate::{Index, Searcher, Term};
|
|
|
|
#[test]
|
|
fn test_exists_query_simple() -> crate::Result<()> {
|
|
let mut schema_builder = Schema::builder();
|
|
let all_field = schema_builder.add_u64_field("all", INDEXED | FAST);
|
|
let even_field = schema_builder.add_u64_field("even", INDEXED | FAST);
|
|
let odd_field = schema_builder.add_text_field("odd", STRING | FAST);
|
|
let multi_field = schema_builder.add_text_field("multi", FAST);
|
|
let _never_field = schema_builder.add_u64_field("never", INDEXED | FAST);
|
|
let schema = schema_builder.build();
|
|
|
|
let index = Index::create_in_ram(schema);
|
|
{
|
|
let mut index_writer = index.writer_for_tests()?;
|
|
for i in 0u64..100u64 {
|
|
if i % 2 == 0 {
|
|
if i % 10 == 0 {
|
|
index_writer.add_document(doc!(all_field => i, even_field => i, multi_field => i.to_string(), multi_field => (i + 1).to_string()))?;
|
|
} else {
|
|
index_writer.add_document(doc!(all_field => i, even_field => i))?;
|
|
}
|
|
} else {
|
|
index_writer.add_document(doc!(all_field => i, odd_field => i.to_string()))?;
|
|
}
|
|
}
|
|
index_writer.commit()?;
|
|
}
|
|
let reader = index.reader()?;
|
|
let searcher = reader.searcher();
|
|
|
|
assert_eq!(count_existing_fields(&searcher, "all", false)?, 100);
|
|
assert_eq!(count_existing_fields(&searcher, "odd", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "even", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "multi", false)?, 10);
|
|
assert_eq!(count_existing_fields(&searcher, "multi", true)?, 10);
|
|
assert_eq!(count_existing_fields(&searcher, "never", false)?, 0);
|
|
|
|
// exercise seek
|
|
let query = BooleanQuery::intersection(vec![
|
|
Box::new(RangeQuery::new(
|
|
Bound::Included(Term::from_field_u64(all_field, 50)),
|
|
Bound::Unbounded,
|
|
)),
|
|
Box::new(ExistsQuery::new("even".to_string(), false)),
|
|
]);
|
|
assert_eq!(searcher.search(&query, &Count)?, 25);
|
|
|
|
let query = BooleanQuery::intersection(vec![
|
|
Box::new(RangeQuery::new(
|
|
Bound::Included(Term::from_field_u64(all_field, 0)),
|
|
Bound::Included(Term::from_field_u64(all_field, 50)),
|
|
)),
|
|
Box::new(ExistsQuery::new("odd".to_string(), false)),
|
|
]);
|
|
assert_eq!(searcher.search(&query, &Count)?, 25);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn test_exists_query_json() -> crate::Result<()> {
|
|
let mut schema_builder = Schema::builder();
|
|
let json = schema_builder.add_json_field("json", TEXT | FAST);
|
|
let schema = schema_builder.build();
|
|
|
|
let index = Index::create_in_ram(schema);
|
|
{
|
|
let mut index_writer = index.writer_for_tests()?;
|
|
for i in 0u64..100u64 {
|
|
if i % 2 == 0 {
|
|
index_writer.add_document(doc!(json => json!({"all": i, "even": true})))?;
|
|
} else {
|
|
index_writer
|
|
.add_document(doc!(json => json!({"all": i.to_string(), "odd": true})))?;
|
|
}
|
|
}
|
|
index_writer.commit()?;
|
|
}
|
|
let reader = index.reader()?;
|
|
let searcher = reader.searcher();
|
|
|
|
assert_eq!(count_existing_fields(&searcher, "json.all", false)?, 100);
|
|
assert_eq!(count_existing_fields(&searcher, "json.even", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "json.even", true)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "json.odd", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
|
|
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
|
|
|
|
// Handling of non-existing fields:
|
|
assert_eq!(count_existing_fields(&searcher, "json.absent", false)?, 0);
|
|
assert_eq!(count_existing_fields(&searcher, "json.absent", true)?, 0);
|
|
assert_does_not_exist(&searcher, "does_not_exists.absent", true);
|
|
assert_does_not_exist(&searcher, "does_not_exists.absent", false);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
|
|
// Build docs where no single subpath exists for all docs, but the union does.
|
|
let mut schema_builder = Schema::builder();
|
|
let json = schema_builder.add_json_field("json", TEXT | FAST);
|
|
let schema = schema_builder.build();
|
|
|
|
let index = Index::create_in_ram(schema);
|
|
{
|
|
let mut index_writer = index.writer_for_tests()?;
|
|
for i in 0u64..100u64 {
|
|
if i % 2 == 0 {
|
|
// only subpath `a`
|
|
index_writer.add_document(doc!(json => json!({"a": i})))?;
|
|
} else {
|
|
// only subpath `b`
|
|
index_writer.add_document(doc!(json => json!({"b": i})))?;
|
|
}
|
|
}
|
|
index_writer.commit()?;
|
|
}
|
|
let reader = index.reader()?;
|
|
let searcher = reader.searcher();
|
|
|
|
// No single subpath is full
|
|
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
|
|
|
|
// Root exists with subpaths disabled is zero
|
|
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
|
|
|
|
// Root exists with subpaths enabled should match all docs via union
|
|
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
|
|
let mut schema_builder = Schema::builder();
|
|
let bool = schema_builder.add_bool_field("bool", FAST);
|
|
let bytes = schema_builder.add_bytes_field("bytes", FAST);
|
|
let date = schema_builder.add_date_field("date", FAST);
|
|
let f64 = schema_builder.add_f64_field("f64", FAST);
|
|
let ip_addr = schema_builder.add_ip_addr_field("ip_addr", FAST);
|
|
let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
|
|
let schema = schema_builder.build();
|
|
|
|
let index = Index::create_in_ram(schema);
|
|
{
|
|
let mut index_writer = index.writer_for_tests()?;
|
|
let now = OffsetDateTime::now_utc().unix_timestamp();
|
|
for i in 0u8..100u8 {
|
|
if i % 2 == 0 {
|
|
let date_val = DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
|
|
now + i as i64 * 100,
|
|
)?);
|
|
index_writer.add_document(
|
|
doc!(bool => i % 3 == 0, bytes => vec![i, i + 1, i + 2], date => date_val),
|
|
)?;
|
|
} else {
|
|
let ip_addr_v6 = Ipv6Addr::new(0, 0, 0, 0, 0, 0xffff, 0xc00a, i.into());
|
|
index_writer
|
|
.add_document(doc!(f64 => i as f64 * 0.5, ip_addr => ip_addr_v6, facet => Facet::from("/facet/foo"), facet => Facet::from("/facet/bar")))?;
|
|
}
|
|
}
|
|
index_writer.commit()?;
|
|
}
|
|
let reader = index.reader()?;
|
|
let searcher = reader.searcher();
|
|
|
|
assert_eq!(count_existing_fields(&searcher, "bool", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "bool", true)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "bytes", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "date", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "f64", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "ip_addr", false)?, 50);
|
|
assert_eq!(count_existing_fields(&searcher, "facet", false)?, 50);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn test_exists_query_unsupported_types() -> crate::Result<()> {
|
|
let mut schema_builder = Schema::builder();
|
|
let not_fast = schema_builder.add_text_field("not_fast", TEXT);
|
|
let schema = schema_builder.build();
|
|
|
|
let index = Index::create_in_ram(schema);
|
|
{
|
|
let mut index_writer = index.writer_for_tests()?;
|
|
index_writer.add_document(doc!(
|
|
not_fast => "slow",
|
|
))?;
|
|
index_writer.commit()?;
|
|
}
|
|
let reader = index.reader()?;
|
|
let searcher = reader.searcher();
|
|
|
|
assert_eq!(
|
|
searcher
|
|
.search(&ExistsQuery::new("not_fast".to_string(), false), &Count)
|
|
.unwrap_err()
|
|
.to_string(),
|
|
"Schema error: 'Field not_fast is not a fast field.'"
|
|
);
|
|
|
|
assert_does_not_exist(&searcher, "does_not_exists", false);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn count_existing_fields(
|
|
searcher: &Searcher,
|
|
field: &str,
|
|
json_subpaths: bool,
|
|
) -> crate::Result<usize> {
|
|
let query = ExistsQuery::new(field.to_string(), json_subpaths);
|
|
searcher.search(&query, &Count)
|
|
}
|
|
|
|
fn assert_does_not_exist(searcher: &Searcher, field: &str, json_subpaths: bool) {
|
|
assert_eq!(
|
|
searcher
|
|
.search(&ExistsQuery::new(field.to_string(), json_subpaths), &Count)
|
|
.unwrap_err()
|
|
.to_string(),
|
|
format!("The field does not exist: '{field}'")
|
|
);
|
|
}
|
|
}
|