mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 09:12:55 +00:00
Compare commits
1 Commits
bucket_id_
...
qw-airmail
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eb5f51f3c4 |
@@ -167,3 +167,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "agg_bench"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "exists_json"
|
||||
harness = false
|
||||
|
||||
69
benches/exists_json.rs
Normal file
69
benches/exists_json.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
use binggan::plugins::PeakMemAllocPlugin;
|
||||
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||
use serde_json::json;
|
||||
use tantivy::collector::Count;
|
||||
use tantivy::query::ExistsQuery;
|
||||
use tantivy::schema::{Schema, FAST, TEXT};
|
||||
use tantivy::{doc, Index};
|
||||
|
||||
#[global_allocator]
|
||||
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
||||
|
||||
fn main() {
|
||||
let doc_count: usize = 500_000;
|
||||
let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
|
||||
|
||||
let indices: Vec<(String, Index)> = subfield_counts
|
||||
.iter()
|
||||
.map(|&sub_fields| {
|
||||
(
|
||||
format!("subfields={sub_fields}"),
|
||||
build_index_with_json_subfields(doc_count, sub_fields),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut group = InputGroup::new_with_inputs(indices);
|
||||
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
|
||||
|
||||
group.config().num_iter_group = Some(1);
|
||||
group.config().num_iter_bench = Some(1);
|
||||
group.register("exists_json", exists_json_union);
|
||||
|
||||
group.run();
|
||||
}
|
||||
|
||||
fn exists_json_union(index: &Index) {
|
||||
let reader = index.reader().expect("reader");
|
||||
let searcher = reader.searcher();
|
||||
let query = ExistsQuery::new("json".to_string(), true);
|
||||
let count = searcher.search(&query, &Count).expect("exists search");
|
||||
// Prevents optimizer from eliding the search
|
||||
black_box(count);
|
||||
}
|
||||
|
||||
fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
|
||||
// Schema: single JSON field stored as FAST to support ExistsQuery.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_from_tempdir(schema).expect("create index");
|
||||
{
|
||||
let mut index_writer = index
|
||||
.writer_with_num_threads(1, 200_000_000)
|
||||
.expect("writer");
|
||||
for i in 0..num_docs {
|
||||
let sub = i % num_subfields;
|
||||
// Only one subpath set per document; rotate subpaths so that
|
||||
// no single subpath is full, but the union covers all docs.
|
||||
let v = json!({ format!("field_{sub}"): i as u64 });
|
||||
index_writer
|
||||
.add_document(doc!(json_field => v))
|
||||
.expect("add_document");
|
||||
}
|
||||
index_writer.commit().expect("commit");
|
||||
}
|
||||
|
||||
index
|
||||
}
|
||||
@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
|
||||
ColumnIndex::Full => Box::new(doc_range),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(
|
||||
optional_index
|
||||
.iter_docs()
|
||||
.iter_non_null_docs()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
|
||||
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
|
||||
multivalued_index
|
||||
.optional_index
|
||||
.iter_docs()
|
||||
.iter_non_null_docs()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
},
|
||||
@@ -177,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
|
||||
ColumnIndex::Full => Box::new(columnar_row_range),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(
|
||||
optional_index
|
||||
.iter_docs()
|
||||
.iter_non_null_docs()
|
||||
.map(move |row_id: RowId| columnar_row_range.start + row_id),
|
||||
),
|
||||
ColumnIndex::Multivalued(_) => {
|
||||
|
||||
@@ -215,6 +215,32 @@ impl MultiValueIndex {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over document ids that have at least one value.
|
||||
pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => {
|
||||
let mut doc: DocId = 0u32;
|
||||
let num_docs = idx.num_docs();
|
||||
Box::new(std::iter::from_fn(move || {
|
||||
// This is not the most efficient way to do this, but it's legacy code.
|
||||
while doc < num_docs {
|
||||
let cur = doc;
|
||||
doc += 1;
|
||||
let start = idx.start_index_column.get_val(cur);
|
||||
let end = idx.start_index_column.get_val(cur + 1);
|
||||
if end > start {
|
||||
return Some(cur);
|
||||
}
|
||||
}
|
||||
None
|
||||
}))
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => {
|
||||
Box::new(idx.optional_index.iter_non_null_docs())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||
/// docids. Positions are converted inplace to docids.
|
||||
///
|
||||
|
||||
@@ -88,7 +88,7 @@ pub struct OptionalIndex {
|
||||
|
||||
impl Iterable<u32> for &OptionalIndex {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new(self.iter_docs())
|
||||
Box::new(self.iter_non_null_docs())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -280,8 +280,9 @@ impl OptionalIndex {
|
||||
self.num_non_null_docs
|
||||
}
|
||||
|
||||
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
|
||||
// TODO optimize
|
||||
pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
|
||||
// TODO optimize. We could iterate over the blocks directly.
|
||||
// We use the dense value ids and retrieve the doc ids via select.
|
||||
let mut select_batch = self.select_cursor();
|
||||
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
|
||||
}
|
||||
|
||||
@@ -164,7 +164,11 @@ fn test_optional_index_large() {
|
||||
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
|
||||
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
|
||||
assert_eq!(optional_index.num_docs(), num_rows);
|
||||
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
|
||||
assert!(
|
||||
optional_index
|
||||
.iter_non_null_docs()
|
||||
.eq(row_ids.iter().copied())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -367,7 +367,7 @@ fn is_empty_after_merge(
|
||||
ColumnIndex::Empty { .. } => true,
|
||||
ColumnIndex::Full => alive_bitset.len() == 0,
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
for doc in optional_index.iter_docs() {
|
||||
for doc in optional_index.iter_non_null_docs() {
|
||||
if alive_bitset.contains(doc) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
use core::fmt::Debug;
|
||||
|
||||
use columnar::{ColumnIndex, DynamicColumn};
|
||||
use common::BitSet;
|
||||
|
||||
use super::{ConstScorer, EmptyScorer};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::query::all_query::AllScorer;
|
||||
use crate::query::boost_query::BoostScorer;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, Score, TantivyError};
|
||||
|
||||
@@ -113,13 +116,49 @@ impl Weight for ExistsWeight {
|
||||
non_empty_columns.push(column)
|
||||
}
|
||||
}
|
||||
// TODO: we can optimizer more here since in most cases we will have only one index
|
||||
if !non_empty_columns.is_empty() {
|
||||
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
} else {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
if non_empty_columns.is_empty() {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
}
|
||||
|
||||
// If any column is full, all docs match.
|
||||
let max_doc = reader.max_doc();
|
||||
if non_empty_columns
|
||||
.iter()
|
||||
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
|
||||
{
|
||||
let all_scorer = AllScorer::new(max_doc);
|
||||
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
|
||||
}
|
||||
|
||||
// If we have a single dynamic column, use ExistsDocSet
|
||||
// NOTE: A lower number may be better for very sparse columns
|
||||
if non_empty_columns.len() < 4 {
|
||||
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
|
||||
return Ok(Box::new(ConstScorer::new(docset, boost)));
|
||||
}
|
||||
|
||||
// If we have many dynamic columns, precompute a bitset of matching docs
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||
for column in &non_empty_columns {
|
||||
match column.column_index() {
|
||||
ColumnIndex::Empty { .. } => {}
|
||||
ColumnIndex::Full => {
|
||||
// Handled by AllScorer return above.
|
||||
}
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
for doc in optional_index.iter_non_null_docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
ColumnIndex::Multivalued(multi_idx) => {
|
||||
for doc in multi_idx.iter_non_null_docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let docset = BitSetDocSet::from(doc_bitset);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
@@ -294,6 +333,43 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
|
||||
// Build docs where no single subpath exists for all docs, but the union does.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json = schema_builder.add_json_field("json", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for i in 0u64..100u64 {
|
||||
if i % 2 == 0 {
|
||||
// only subpath `a`
|
||||
index_writer.add_document(doc!(json => json!({"a": i})))?;
|
||||
} else {
|
||||
// only subpath `b`
|
||||
index_writer.add_document(doc!(json => json!({"b": i})))?;
|
||||
}
|
||||
}
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// No single subpath is full
|
||||
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
|
||||
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
|
||||
|
||||
// Root exists with subpaths disabled is zero
|
||||
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
|
||||
|
||||
// Root exists with subpaths enabled should match all docs via union
|
||||
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
Reference in New Issue
Block a user