Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-04 16:22:55 +00:00)

Compare commits: 15 commits (branch unit-test-..., author paul.masur)

Commit SHAs:
ebb82dc549, 270ca5123c, 714366d3b9, 40659d4d07, e1e131a804,
70da310b2d, 85010b589a, 2340dca628, 71a26d5b24, 203751f2fe,
7963b0b4aa, d5eefca11d, 5d6c8de23e, a06365f39f, f4b374110f
20  CHANGELOG.md
@@ -14,6 +14,18 @@ Tantivy 0.25
- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)

Tantivy 0.24.2
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)

Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`.
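For context, a minimal sketch of the affected call pattern (not part of the CHANGELOG; the `rating` fast field is hypothetical):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::Query;
use tantivy::{DocAddress, Order, Searcher};

// Ascending top-10 over a u64 fast field: the code path fixed by #2672.
fn top10_lowest(
    searcher: &Searcher,
    query: &dyn Query,
) -> tantivy::Result<Vec<(u64, DocAddress)>> {
    let collector = TopDocs::with_limit(10).order_by_fast_field::<u64>("rating", Order::Asc);
    searcher.search(query, &collector)
}
```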

Tantivy 0.24.1
================================
- Fix: bump required rust version to 1.81

Tantivy 0.24
================================
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.

@@ -96,6 +108,14 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)

Tantivy 0.22.1
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)

Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`.

Tantivy 0.22
================================

@@ -167,3 +167,12 @@ harness = false

[[bench]]
name = "agg_bench"
harness = false

[[bench]]
name = "exists_json"
harness = false

[[bench]]
name = "and_or_queries"
harness = false

224  benches/and_or_queries.rs  (new file)
@@ -0,0 +1,224 @@
// Benchmarks boolean conjunction queries using binggan.
//
// What’s measured:
// - OR and AND queries with varying selectivity (only `Term` queries on the leaves for now)
// - Nested AND/OR combinations (on multiple fields)
// - No-scoring path using the Count collector (focus on iterator/skip performance)
// - Top-K retrieval (k=10) using the TopDocs collector
//
// Corpus model:
// - Synthetic docs; each token a/b/c is independently included per doc
// - If none of a/b/c are included, emit a neutral filler token to keep doc length similar
//
// Notes:
// - After optimization, when scoring is disabled Tantivy reads doc-only postings
//   (IndexRecordOption::Basic), avoiding frequency-decoding overhead.
// - This bench isolates boolean iteration speed and intersection/union cost.
// - Use `cargo bench --bench and_or_queries` to run.

use binggan::{black_box, BenchRunner};
|
||||
use rand::prelude::*;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Schema, TEXT};
|
||||
use tantivy::{doc, Index, ReloadPolicy, Searcher};
|
||||
|
||||
#[derive(Clone)]
|
||||
struct BenchIndex {
|
||||
#[allow(dead_code)]
|
||||
index: Index,
|
||||
searcher: Searcher,
|
||||
query_parser: QueryParser,
|
||||
}
|
||||
|
||||
impl BenchIndex {
|
||||
#[inline(always)]
|
||||
fn count_query(&self, query_str: &str) -> usize {
|
||||
let query = self.query_parser.parse_query(query_str).unwrap();
|
||||
self.searcher.search(&query, &Count).unwrap()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn topk_len(&self, query_str: &str, k: usize) -> usize {
|
||||
let query = self.query_parser.parse_query(query_str).unwrap();
|
||||
self.searcher
|
||||
.search(&query, &TopDocs::with_limit(k))
|
||||
.unwrap()
|
||||
.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a single index containing both fields (title, body) and
|
||||
/// return two BenchIndex views:
|
||||
/// - single_field: QueryParser defaults to only "body"
|
||||
/// - multi_field: QueryParser defaults to ["title", "body"]
|
||||
fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (BenchIndex, BenchIndex) {
|
||||
// Unified schema (two text fields)
|
||||
let mut schema_builder = Schema::builder();
|
||||
let f_title = schema_builder.add_text_field("title", TEXT);
|
||||
let f_body = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
// Populate index with stable RNG for reproducibility.
|
||||
let mut rng = StdRng::from_seed([7u8; 32]);
|
||||
|
||||
// Populate: spread each present token 90/10 to body/title
|
||||
{
|
||||
let mut writer = index.writer(500_000_000).unwrap();
|
||||
for _ in 0..num_docs {
|
||||
let has_a = rng.gen_bool(p_a as f64);
|
||||
let has_b = rng.gen_bool(p_b as f64);
|
||||
let has_c = rng.gen_bool(p_c as f64);
|
||||
let mut title_tokens: Vec<&str> = Vec::new();
|
||||
let mut body_tokens: Vec<&str> = Vec::new();
|
||||
if has_a {
|
||||
if rng.gen_bool(0.1) {
|
||||
title_tokens.push("a");
|
||||
} else {
|
||||
body_tokens.push("a");
|
||||
}
|
||||
}
|
||||
if has_b {
|
||||
if rng.gen_bool(0.1) {
|
||||
title_tokens.push("b");
|
||||
} else {
|
||||
body_tokens.push("b");
|
||||
}
|
||||
}
|
||||
if has_c {
|
||||
if rng.gen_bool(0.1) {
|
||||
title_tokens.push("c");
|
||||
} else {
|
||||
body_tokens.push("c");
|
||||
}
|
||||
}
|
||||
if title_tokens.is_empty() && body_tokens.is_empty() {
|
||||
body_tokens.push("z");
|
||||
}
|
||||
writer
|
||||
.add_document(doc!(
|
||||
f_title=>title_tokens.join(" "),
|
||||
f_body=>body_tokens.join(" ")
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
}
|
||||
|
||||
// Prepare reader/searcher once.
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// Build two query parsers with different default fields.
|
||||
let qp_single = QueryParser::for_index(&index, vec![f_body]);
|
||||
let qp_multi = QueryParser::for_index(&index, vec![f_title, f_body]);
|
||||
|
||||
let single_view = BenchIndex {
|
||||
index: index.clone(),
|
||||
searcher: searcher.clone(),
|
||||
query_parser: qp_single,
|
||||
};
|
||||
let multi_view = BenchIndex {
|
||||
index,
|
||||
searcher,
|
||||
query_parser: qp_multi,
|
||||
};
|
||||
(single_view, multi_view)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Prepare corpora with varying selectivity. Build one index per corpus
|
||||
// and derive two views (single-field vs multi-field) from it.
|
||||
let scenarios = vec![
|
||||
(
|
||||
"N=1M, p(a)=5%, p(b)=1%, p(c)=15%".to_string(),
|
||||
1_000_000,
|
||||
0.05,
|
||||
0.01,
|
||||
0.15,
|
||||
),
|
||||
(
|
||||
"N=1M, p(a)=1%, p(b)=1%, p(c)=15%".to_string(),
|
||||
1_000_000,
|
||||
0.01,
|
||||
0.01,
|
||||
0.15,
|
||||
),
|
||||
];
|
||||
|
||||
let mut runner = BenchRunner::new();
|
||||
for (label, n, pa, pb, pc) in scenarios {
|
||||
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
|
||||
|
||||
// Single-field group: default field is body only
|
||||
{
|
||||
let mut group = runner.new_group();
|
||||
group.set_name(format!("single_field — {}", label));
|
||||
group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("+a +b"))
|
||||
});
|
||||
group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("+a +b +c"))
|
||||
});
|
||||
group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("+a +b", 10))
|
||||
});
|
||||
group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("+a +b +c", 10))
|
||||
});
|
||||
// OR queries
|
||||
group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("a OR b"))
|
||||
});
|
||||
group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("a OR b OR c"))
|
||||
});
|
||||
group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("a OR b", 10))
|
||||
});
|
||||
group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("a OR b OR c", 10))
|
||||
});
|
||||
group.run();
|
||||
}
|
||||
|
||||
// Multi-field group: default fields are [title, body]
|
||||
{
|
||||
let mut group = runner.new_group();
|
||||
group.set_name(format!("multi_field — {}", label));
|
||||
group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("+a +b"))
|
||||
});
|
||||
group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("+a +b +c"))
|
||||
});
|
||||
group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("+a +b", 10))
|
||||
});
|
||||
group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("+a +b +c", 10))
|
||||
});
|
||||
// OR queries
|
||||
group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("a OR b"))
|
||||
});
|
||||
group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.count_query("a OR b OR c"))
|
||||
});
|
||||
group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("a OR b", 10))
|
||||
});
|
||||
group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
|
||||
black_box(benv.topk_len("a OR b OR c", 10))
|
||||
});
|
||||
group.run();
|
||||
}
|
||||
}
|
||||
}
|
||||
69  benches/exists_json.rs  (new file)
@@ -0,0 +1,69 @@
use binggan::plugins::PeakMemAllocPlugin;
|
||||
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||
use serde_json::json;
|
||||
use tantivy::collector::Count;
|
||||
use tantivy::query::ExistsQuery;
|
||||
use tantivy::schema::{Schema, FAST, TEXT};
|
||||
use tantivy::{doc, Index};
|
||||
|
||||
#[global_allocator]
|
||||
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
||||
|
||||
fn main() {
|
||||
let doc_count: usize = 500_000;
|
||||
let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
|
||||
|
||||
let indices: Vec<(String, Index)> = subfield_counts
|
||||
.iter()
|
||||
.map(|&sub_fields| {
|
||||
(
|
||||
format!("subfields={sub_fields}"),
|
||||
build_index_with_json_subfields(doc_count, sub_fields),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut group = InputGroup::new_with_inputs(indices);
|
||||
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
|
||||
|
||||
group.config().num_iter_group = Some(1);
|
||||
group.config().num_iter_bench = Some(1);
|
||||
group.register("exists_json", exists_json_union);
|
||||
|
||||
group.run();
|
||||
}
|
||||
|
||||
fn exists_json_union(index: &Index) {
|
||||
let reader = index.reader().expect("reader");
|
||||
let searcher = reader.searcher();
|
||||
let query = ExistsQuery::new("json".to_string(), true);
|
||||
let count = searcher.search(&query, &Count).expect("exists search");
|
||||
// Prevents optimizer from eliding the search
|
||||
black_box(count);
|
||||
}
|
||||
|
||||
fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
|
||||
// Schema: single JSON field stored as FAST to support ExistsQuery.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_from_tempdir(schema).expect("create index");
|
||||
{
|
||||
let mut index_writer = index
|
||||
.writer_with_num_threads(1, 200_000_000)
|
||||
.expect("writer");
|
||||
for i in 0..num_docs {
|
||||
let sub = i % num_subfields;
|
||||
// Only one subpath set per document; rotate subpaths so that
|
||||
// no single subpath is full, but the union covers all docs.
|
||||
let v = json!({ format!("field_{sub}"): i as u64 });
|
||||
index_writer
|
||||
.add_document(doc!(json_field => v))
|
||||
.expect("add_document");
|
||||
}
|
||||
index_writer.commit().expect("commit");
|
||||
}
|
||||
|
||||
index
|
||||
}
|
||||
@@ -48,7 +48,7 @@ impl BitPacker {
|
||||
|
||||
pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let num_bytes = self.mini_buffer_written.div_ceil(8);
|
||||
let bytes = self.mini_buffer.to_le_bytes();
|
||||
output.write_all(&bytes[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
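The `(x + 7) / 8` → `x.div_ceil(8)` substitutions in this and the following hunks are behavior-preserving; a standalone check of the equivalence (illustrative only):

```rust
fn main() {
    for x in 0usize..=4096 {
        // Both forms compute the number of bytes needed to hold `x` bits.
        assert_eq!((x + 7) / 8, x.div_ceil(8));
    }
    // `div_ceil` also sidesteps the theoretical overflow of `x + 7` near usize::MAX.
}
```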
@@ -138,7 +138,7 @@ impl BitUnpacker {
|
||||
|
||||
// We use `usize` here to avoid overflow issues.
|
||||
let end_bit_read = (end_idx as usize) * self.num_bits;
|
||||
let end_byte_read = (end_bit_read + 7) / 8;
|
||||
let end_byte_read = end_bit_read.div_ceil(8);
|
||||
assert!(
|
||||
end_byte_read <= data.len(),
|
||||
"Requested index is out of bounds."
|
||||
|
||||
@@ -140,10 +140,10 @@ impl BlockedBitpacker {
|
||||
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
// todo performance: we could decompress a whole block and cache it instead
|
||||
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
|
||||
let iter = (0..bitpacked_elems)
|
||||
|
||||
(0..bitpacked_elems)
|
||||
.map(move |idx| self.get(idx))
|
||||
.chain(self.buffer.iter().cloned());
|
||||
iter
|
||||
.chain(self.buffer.iter().cloned())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
|
||||
ColumnIndex::Full => Box::new(doc_range),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(
|
||||
optional_index
|
||||
.iter_docs()
|
||||
.iter_non_null_docs()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
|
||||
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
|
||||
multivalued_index
|
||||
.optional_index
|
||||
.iter_docs()
|
||||
.iter_non_null_docs()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
},
|
||||
@@ -105,10 +105,11 @@ fn get_num_values_iterator<'a>(
|
||||
) -> Box<dyn Iterator<Item = u32> + 'a> {
|
||||
match column_index {
|
||||
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
|
||||
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
|
||||
}
|
||||
ColumnIndex::Full => Box::new(std::iter::repeat_n(1u32, num_docs as usize)),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(std::iter::repeat_n(
|
||||
1u32,
|
||||
optional_index.num_non_nulls() as usize,
|
||||
)),
|
||||
ColumnIndex::Multivalued(multivalued_index) => Box::new(
|
||||
multivalued_index
|
||||
.get_start_index_column()
|
||||
@@ -177,7 +178,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
|
||||
ColumnIndex::Full => Box::new(columnar_row_range),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(
|
||||
optional_index
|
||||
.iter_docs()
|
||||
.iter_non_null_docs()
|
||||
.map(move |row_id: RowId| columnar_row_range.start + row_id),
|
||||
),
|
||||
ColumnIndex::Multivalued(_) => {
|
||||
|
||||
@@ -215,6 +215,32 @@ impl MultiValueIndex {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over document ids that have at least one value.
|
||||
pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => {
|
||||
let mut doc: DocId = 0u32;
|
||||
let num_docs = idx.num_docs();
|
||||
Box::new(std::iter::from_fn(move || {
|
||||
// This is not the most efficient way to do this, but it's legacy code.
|
||||
while doc < num_docs {
|
||||
let cur = doc;
|
||||
doc += 1;
|
||||
let start = idx.start_index_column.get_val(cur);
|
||||
let end = idx.start_index_column.get_val(cur + 1);
|
||||
if end > start {
|
||||
return Some(cur);
|
||||
}
|
||||
}
|
||||
None
|
||||
}))
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => {
|
||||
Box::new(idx.optional_index.iter_non_null_docs())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||
/// docids. Positions are converted inplace to docids.
|
||||
///
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::io::{self, Write};
|
||||
use std::io;
|
||||
use std::sync::Arc;
|
||||
|
||||
mod set;
|
||||
@@ -11,7 +11,7 @@ use set_block::{
|
||||
};
|
||||
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{DocId, InvalidData, RowId};
|
||||
use crate::{DocId, RowId};
|
||||
|
||||
/// The threshold for the number of elements after which we switch to dense block encoding.
|
||||
///
|
||||
@@ -88,7 +88,7 @@ pub struct OptionalIndex {
|
||||
|
||||
impl Iterable<u32> for &OptionalIndex {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new(self.iter_docs())
|
||||
Box::new(self.iter_non_null_docs())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -280,8 +280,9 @@ impl OptionalIndex {
|
||||
self.num_non_null_docs
|
||||
}
|
||||
|
||||
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
|
||||
// TODO optimize
|
||||
pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
|
||||
// TODO optimize. We could iterate over the blocks directly.
|
||||
// We use the dense value ids and retrieve the doc ids via select.
|
||||
let mut select_batch = self.select_cursor();
|
||||
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
|
||||
}
|
||||
@@ -334,38 +335,6 @@ enum Block<'a> {
|
||||
Sparse(SparseBlock<'a>),
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
enum OptionalIndexCodec {
|
||||
Dense = 0,
|
||||
Sparse = 1,
|
||||
}
|
||||
|
||||
impl OptionalIndexCodec {
|
||||
fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
fn try_from_code(code: u8) -> Result<Self, InvalidData> {
|
||||
match code {
|
||||
0 => Ok(Self::Dense),
|
||||
1 => Ok(Self::Sparse),
|
||||
_ => Err(InvalidData),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for OptionalIndexCodec {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_all(&[self.to_code()])
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let optional_codec_code = u8::deserialize(reader)?;
|
||||
let optional_codec = Self::try_from_code(optional_codec_code)?;
|
||||
Ok(optional_codec)
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -> io::Result<()> {
|
||||
let is_sparse = is_sparse(block_els.len() as u32);
|
||||
if is_sparse {
|
||||
|
||||
@@ -164,7 +164,11 @@ fn test_optional_index_large() {
|
||||
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
|
||||
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
|
||||
assert_eq!(optional_index.num_docs(), num_rows);
|
||||
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
|
||||
assert!(
|
||||
optional_index
|
||||
.iter_non_null_docs()
|
||||
.eq(row_ids.iter().copied())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -185,10 +185,10 @@ impl CompactSpaceBuilder {
|
||||
let mut covered_space = Vec::with_capacity(self.blanks.len());
|
||||
|
||||
// beginning of the blanks
|
||||
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
|
||||
if *first_blank_start != 0 {
|
||||
covered_space.push(0..=first_blank_start - 1);
|
||||
}
|
||||
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start)
|
||||
&& *first_blank_start != 0
|
||||
{
|
||||
covered_space.push(0..=first_blank_start - 1);
|
||||
}
|
||||
|
||||
// Between the blanks
|
||||
@@ -202,10 +202,10 @@ impl CompactSpaceBuilder {
|
||||
covered_space.extend(between_blanks);
|
||||
|
||||
// end of the blanks
|
||||
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
|
||||
if *last_blank_end != u128::MAX {
|
||||
covered_space.push(last_blank_end + 1..=u128::MAX);
|
||||
}
|
||||
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end)
|
||||
&& *last_blank_end != u128::MAX
|
||||
{
|
||||
covered_space.push(last_blank_end + 1..=u128::MAX);
|
||||
}
|
||||
|
||||
if covered_space.is_empty() {
|
||||
|
||||
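The two hunks above rewrite nested `if let { if cond { ... } }` blocks as let-chains. A standalone sketch of the pattern (hypothetical helper; assumes a toolchain with let-chains, i.e. the 2024 edition):

```rust
fn first_nonzero(values: &[u32]) -> Option<u32> {
    // Let-chain: the `if let` binding and the extra condition share one block.
    if let Some(first) = values.first()
        && *first != 0
    {
        return Some(*first);
    }
    None
}

fn main() {
    assert_eq!(first_nonzero(&[3, 0, 1]), Some(3));
    assert_eq!(first_nonzero(&[0, 1]), None);
}
```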
@@ -105,7 +105,7 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
|
||||
|
||||
fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
|
||||
let num_bits_per_value = num_bits(stats);
|
||||
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
|
||||
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64)).div_ceil(8))
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
|
||||
@@ -117,7 +117,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
|
||||
Some(
|
||||
stats.num_bytes()
|
||||
+ linear_params.num_bytes()
|
||||
+ (num_bits as u64 * stats.num_rows as u64 + 7) / 8,
|
||||
+ (num_bits as u64 * stats.num_rows as u64).div_ceil(8),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -367,7 +367,7 @@ fn is_empty_after_merge(
|
||||
ColumnIndex::Empty { .. } => true,
|
||||
ColumnIndex::Full => alive_bitset.len() == 0,
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
for doc in optional_index.iter_docs() {
|
||||
for doc in optional_index.iter_non_null_docs() {
|
||||
if alive_bitset.contains(doc) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -244,7 +244,7 @@ impl SymbolValue for UnorderedId {
|
||||
|
||||
fn compute_num_bytes_for_u64(val: u64) -> usize {
|
||||
let msb = (64u32 - val.leading_zeros()) as usize;
|
||||
(msb + 7) / 8
|
||||
msb.div_ceil(8)
|
||||
}
|
||||
|
||||
fn encode_zig_zag(n: i64) -> u64 {
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use common::DateTime;
|
||||
|
||||
use crate::InvalidData;
|
||||
@@ -9,6 +11,23 @@ pub enum NumericalValue {
|
||||
F64(f64),
|
||||
}
|
||||
|
||||
impl FromStr for NumericalValue {
|
||||
type Err = ();
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, ()> {
|
||||
if let Ok(val_i64) = s.parse::<i64>() {
|
||||
return Ok(val_i64.into());
|
||||
}
|
||||
if let Ok(val_u64) = s.parse::<u64>() {
|
||||
return Ok(val_u64.into());
|
||||
}
|
||||
if let Ok(val_f64) = s.parse::<f64>() {
|
||||
return Ok(NumericalValue::from(val_f64).normalize());
|
||||
}
|
||||
Err(())
|
||||
}
|
||||
}
|
||||
|
||||
impl NumericalValue {
|
||||
pub fn numerical_type(&self) -> NumericalType {
|
||||
match self {
|
||||
@@ -26,7 +45,7 @@ impl NumericalValue {
|
||||
if val <= i64::MAX as u64 {
|
||||
NumericalValue::I64(val as i64)
|
||||
} else {
|
||||
NumericalValue::F64(val as f64)
|
||||
NumericalValue::U64(val)
|
||||
}
|
||||
}
|
||||
NumericalValue::I64(val) => NumericalValue::I64(val),
|
||||
@@ -141,6 +160,7 @@ impl Coerce for DateTime {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::NumericalType;
|
||||
use crate::NumericalValue;
|
||||
|
||||
#[test]
|
||||
fn test_numerical_type_code() {
|
||||
@@ -153,4 +173,58 @@ mod tests {
|
||||
}
|
||||
assert_eq!(num_numerical_type, 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_numerical() {
|
||||
assert_eq!(
|
||||
"123".parse::<NumericalValue>().unwrap(),
|
||||
NumericalValue::I64(123)
|
||||
);
|
||||
assert_eq!(
|
||||
"18446744073709551615".parse::<NumericalValue>().unwrap(),
|
||||
NumericalValue::U64(18446744073709551615u64)
|
||||
);
|
||||
assert_eq!(
|
||||
"1.0".parse::<NumericalValue>().unwrap(),
|
||||
NumericalValue::I64(1i64)
|
||||
);
|
||||
assert_eq!(
|
||||
"1.1".parse::<NumericalValue>().unwrap(),
|
||||
NumericalValue::F64(1.1f64)
|
||||
);
|
||||
assert_eq!(
|
||||
"-1.0".parse::<NumericalValue>().unwrap(),
|
||||
NumericalValue::I64(-1i64)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normalize_numerical() {
|
||||
assert_eq!(
|
||||
NumericalValue::from(1u64).normalize(),
|
||||
NumericalValue::I64(1i64),
|
||||
);
|
||||
let limit_val = i64::MAX as u64 + 1u64;
|
||||
assert_eq!(
|
||||
NumericalValue::from(limit_val).normalize(),
|
||||
NumericalValue::U64(limit_val),
|
||||
);
|
||||
assert_eq!(
|
||||
NumericalValue::from(-1i64).normalize(),
|
||||
NumericalValue::I64(-1i64),
|
||||
);
|
||||
assert_eq!(
|
||||
NumericalValue::from(-2.0f64).normalize(),
|
||||
NumericalValue::I64(-2i64),
|
||||
);
|
||||
assert_eq!(
|
||||
NumericalValue::from(-2.1f64).normalize(),
|
||||
NumericalValue::F64(-2.1f64),
|
||||
);
|
||||
let large_float = 2.0f64.powf(70.0f64);
|
||||
assert_eq!(
|
||||
NumericalValue::from(large_float).normalize(),
|
||||
NumericalValue::F64(large_float),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,7 +183,7 @@ pub struct BitSet {
|
||||
}
|
||||
|
||||
fn num_buckets(max_val: u32) -> u32 {
|
||||
(max_val + 63u32) / 64u32
|
||||
max_val.div_ceil(64u32)
|
||||
}
|
||||
|
||||
impl BitSet {
|
||||
|
||||
@@ -29,6 +29,7 @@ impl BinarySerializable for VIntU128 {
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
#[allow(clippy::unbuffered_bytes)]
|
||||
let mut bytes = reader.bytes();
|
||||
let mut result = 0u128;
|
||||
let mut shift = 0u64;
|
||||
@@ -52,7 +53,7 @@ impl BinarySerializable for VIntU128 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
pub struct VInt(pub u64);
|
||||
|
||||
@@ -196,6 +197,7 @@ impl BinarySerializable for VInt {
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
#[allow(clippy::unbuffered_bytes)]
|
||||
let mut bytes = reader.bytes();
|
||||
let mut result = 0u64;
|
||||
let mut shift = 0u64;
|
||||
|
||||
@@ -15,3 +15,5 @@ edition = "2024"
|
||||
nom = "7"
|
||||
serde = { version = "1.0.219", features = ["derive"] }
|
||||
serde_json = "1.0.140"
|
||||
ordered-float = "5.0.0"
|
||||
fnv = "1.0.7"
|
||||
|
||||
@@ -117,6 +117,22 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn terminated_infallible<I, O1, O2, F, G>(
|
||||
mut first: F,
|
||||
mut second: G,
|
||||
) -> impl FnMut(I) -> JResult<I, O1>
|
||||
where
|
||||
F: nom::Parser<I, (O1, ErrorList), Infallible>,
|
||||
G: nom::Parser<I, (O2, ErrorList), Infallible>,
|
||||
{
|
||||
move |input: I| {
|
||||
let (input, (o1, mut err)) = first.parse(input)?;
|
||||
let (input, (_, mut err2)) = second.parse(input)?;
|
||||
err.append(&mut err2);
|
||||
Ok((input, (o1, err)))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
|
||||
mut first: F,
|
||||
mut second: G,
|
||||
|
||||
@@ -31,7 +31,17 @@ pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{parse_query, parse_query_lenient};
|
||||
use crate::{UserInputAst, parse_query, parse_query_lenient};
|
||||
|
||||
#[test]
|
||||
fn test_deduplication() {
|
||||
let ast: UserInputAst = parse_query("a a").unwrap();
|
||||
let json = serde_json::to_string(&ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"bool","clauses":[[null,{"type":"literal","field_name":null,"phrase":"a","delimiter":"none","slop":0,"prefix":false}]]}"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_serialization() {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::iter::once;
|
||||
|
||||
use fnv::FnvHashSet;
|
||||
use nom::IResult;
|
||||
use nom::branch::alt;
|
||||
use nom::bytes::complete::tag;
|
||||
@@ -68,7 +69,7 @@ fn interpret_escape(source: &str) -> String {
|
||||
|
||||
/// Consume a word outside of any context.
|
||||
// TODO should support escape sequences
|
||||
fn word(inp: &str) -> IResult<&str, Cow<str>> {
|
||||
fn word(inp: &str) -> IResult<&str, Cow<'_, str>> {
|
||||
map_res(
|
||||
recognize(tuple((
|
||||
alt((
|
||||
@@ -305,15 +306,14 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
|
||||
let (inp, (field_name, _, _, _)) =
|
||||
tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed");
|
||||
|
||||
let res = delimited_infallible(
|
||||
delimited_infallible(
|
||||
nothing,
|
||||
map(ast_infallible, |(mut ast, errors)| {
|
||||
ast.set_default_field(field_name.to_string());
|
||||
(ast, errors)
|
||||
}),
|
||||
opt_i_err(char(')'), "expected ')'"),
|
||||
)(inp);
|
||||
res
|
||||
)(inp)
|
||||
}
|
||||
|
||||
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
|
||||
@@ -367,7 +367,10 @@ fn literal(inp: &str) -> IResult<&str, UserInputAst> {
|
||||
// something (a field name) got parsed before
|
||||
alt((
|
||||
map(
|
||||
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
|
||||
tuple((
|
||||
opt(field_name),
|
||||
alt((range, set, exists, regex, term_or_phrase)),
|
||||
)),
|
||||
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
|
||||
),
|
||||
term_group,
|
||||
@@ -389,6 +392,10 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|
||||
value((), peek(one_of("{[><"))),
|
||||
map(range_infallible, |(range, errs)| (Some(range), errs)),
|
||||
),
|
||||
(
|
||||
value((), peek(one_of("/"))),
|
||||
map(regex_infallible, |(regex, errs)| (Some(regex), errs)),
|
||||
),
|
||||
),
|
||||
delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing),
|
||||
),
|
||||
@@ -689,6 +696,61 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
}
|
||||
}
|
||||
|
||||
fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
|
||||
map(
|
||||
terminated(
|
||||
delimited(
|
||||
char('/'),
|
||||
many1(alt((preceded(char('\\'), char('/')), none_of("/")))),
|
||||
char('/'),
|
||||
),
|
||||
peek(alt((multispace1, eof))),
|
||||
),
|
||||
|elements| UserInputLeaf::Regex {
|
||||
field: None,
|
||||
pattern: elements.into_iter().collect::<String>(),
|
||||
},
|
||||
)(inp)
|
||||
}
|
||||
|
||||
fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
match terminated_infallible(
|
||||
delimited_infallible(
|
||||
opt_i_err(char('/'), "missing delimiter /"),
|
||||
opt_i(many1(alt((preceded(char('\\'), char('/')), none_of("/"))))),
|
||||
opt_i_err(char('/'), "missing delimiter /"),
|
||||
),
|
||||
opt_i_err(
|
||||
peek(alt((multispace1, eof))),
|
||||
"expected whitespace or end of input",
|
||||
),
|
||||
)(inp)
|
||||
{
|
||||
Ok((rest, (elements_part, errors))) => {
|
||||
let pattern = match elements_part {
|
||||
Some(elements_part) => elements_part.into_iter().collect(),
|
||||
None => String::new(),
|
||||
};
|
||||
let res = UserInputLeaf::Regex {
|
||||
field: None,
|
||||
pattern,
|
||||
};
|
||||
Ok((rest, (res, errors)))
|
||||
}
|
||||
Err(e) => {
|
||||
let errs = vec![LenientErrorInternal {
|
||||
pos: inp.len(),
|
||||
message: e.to_string(),
|
||||
}];
|
||||
let res = UserInputLeaf::Regex {
|
||||
field: None,
|
||||
pattern: String::new(),
|
||||
};
|
||||
Ok((inp, (res, errs)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn negate(expr: UserInputAst) -> UserInputAst {
|
||||
expr.unary(Occur::MustNot)
|
||||
}
|
||||
@@ -753,7 +815,7 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
|
||||
tuple((leaf, fallible(boost))),
|
||||
|(leaf, boost_opt)| match boost_opt {
|
||||
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => {
|
||||
UserInputAst::Boost(Box::new(leaf), boost)
|
||||
UserInputAst::Boost(Box::new(leaf), boost.into())
|
||||
}
|
||||
_ => leaf,
|
||||
},
|
||||
@@ -765,7 +827,7 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
|
||||
tuple_infallible((leaf_infallible, boost)),
|
||||
|((leaf, boost_opt), error)| match boost_opt {
|
||||
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => (
|
||||
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost)),
|
||||
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost.into())),
|
||||
error,
|
||||
),
|
||||
_ => (leaf, error),
|
||||
@@ -1016,12 +1078,25 @@ pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>
|
||||
(rewrite_ast(res), errors)
|
||||
}
|
||||
|
||||
/// Removes unnecessary children clauses in AST
|
||||
///
|
||||
/// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
|
||||
fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {
|
||||
if let UserInputAst::Clause(terms) = &mut input {
|
||||
for term in terms {
|
||||
if let UserInputAst::Clause(sub_clauses) = &mut input {
|
||||
// call rewrite_ast recursively on children clauses if applicable
|
||||
let mut new_clauses = Vec::with_capacity(sub_clauses.len());
|
||||
for (occur, clause) in sub_clauses.drain(..) {
|
||||
let rewritten_clause = rewrite_ast(clause);
|
||||
new_clauses.push((occur, rewritten_clause));
|
||||
}
|
||||
*sub_clauses = new_clauses;
|
||||
|
||||
// remove duplicate child clauses
|
||||
// e.g. (+a +b) OR (+c +d) OR (+a +b) => (+a +b) OR (+c +d)
|
||||
let mut seen = FnvHashSet::default();
|
||||
sub_clauses.retain(|term| seen.insert(term.clone()));
|
||||
|
||||
// Removes unnecessary children clauses in AST
|
||||
//
|
||||
// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
|
||||
for term in sub_clauses {
|
||||
rewrite_ast_clause(term);
|
||||
}
|
||||
}
|
||||
@@ -1694,6 +1769,63 @@ mod test {
|
||||
test_is_parse_err(r#"!bc:def"#, "!bc:def");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_regex_parser() {
|
||||
let r = parse_to_ast(r#"a:/joh?n(ath[oa]n)/"#);
|
||||
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
|
||||
let (_, input) = r.unwrap();
|
||||
match input {
|
||||
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
|
||||
UserInputLeaf::Regex { field, pattern } => {
|
||||
assert_eq!(field, &Some("a".to_string()));
|
||||
assert_eq!(pattern, "joh?n(ath[oa]n)");
|
||||
}
|
||||
_ => panic!("Expected a regex leaf, got {leaf:?}"),
|
||||
},
|
||||
_ => panic!("Expected a leaf"),
|
||||
}
|
||||
let r = parse_to_ast(r#"a:/\\/cgi-bin\\/luci.*/"#);
|
||||
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
|
||||
let (_, input) = r.unwrap();
|
||||
match input {
|
||||
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
|
||||
UserInputLeaf::Regex { field, pattern } => {
|
||||
assert_eq!(field, &Some("a".to_string()));
|
||||
assert_eq!(pattern, "\\/cgi-bin\\/luci.*");
|
||||
}
|
||||
_ => panic!("Expected a regex leaf, got {leaf:?}"),
|
||||
},
|
||||
_ => panic!("Expected a leaf"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_regex_parser_lenient() {
|
||||
let literal = |query| literal_infallible(query).unwrap().1;
|
||||
|
||||
let (res, errs) = literal(r#"a:/joh?n(ath[oa]n)/"#);
|
||||
let expected = UserInputLeaf::Regex {
|
||||
field: Some("a".to_string()),
|
||||
pattern: "joh?n(ath[oa]n)".to_string(),
|
||||
}
|
||||
.into();
|
||||
assert_eq!(res.unwrap(), expected);
|
||||
assert!(errs.is_empty(), "Expected no errors, got: {errs:?}");
|
||||
|
||||
let (res, errs) = literal("title:/joh?n(ath[oa]n)");
|
||||
let expected = UserInputLeaf::Regex {
|
||||
field: Some("title".to_string()),
|
||||
pattern: "joh?n(ath[oa]n)".to_string(),
|
||||
}
|
||||
.into();
|
||||
assert_eq!(res.unwrap(), expected);
|
||||
assert_eq!(errs.len(), 1, "Expected 1 error, got: {errs:?}");
|
||||
assert_eq!(
|
||||
errs[0].message, "missing delimiter /",
|
||||
"Unexpected error message",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_space_before_value() {
|
||||
test_parse_query_to_ast_helper("field : a", r#""field":a"#);
|
||||
|
||||
@@ -5,7 +5,7 @@ use serde::Serialize;
|
||||
|
||||
use crate::Occur;
|
||||
|
||||
#[derive(PartialEq, Clone, Serialize)]
|
||||
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum UserInputLeaf {
|
||||
@@ -23,6 +23,10 @@ pub enum UserInputLeaf {
|
||||
Exists {
|
||||
field: String,
|
||||
},
|
||||
Regex {
|
||||
field: Option<String>,
|
||||
pattern: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl UserInputLeaf {
|
||||
@@ -46,6 +50,7 @@ impl UserInputLeaf {
|
||||
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
|
||||
field: field.expect("Exist query without a field isn't allowed"),
|
||||
},
|
||||
UserInputLeaf::Regex { field: _, pattern } => UserInputLeaf::Regex { field, pattern },
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,11 +108,19 @@ impl Debug for UserInputLeaf {
|
||||
UserInputLeaf::Exists { field } => {
|
||||
write!(formatter, "$exists(\"{field}\")")
|
||||
}
|
||||
UserInputLeaf::Regex { field, pattern } => {
|
||||
if let Some(field) = field {
|
||||
// TODO properly escape field (in case of \")
|
||||
write!(formatter, "\"{field}\":")?;
|
||||
}
|
||||
// TODO properly escape pattern (in case of \")
|
||||
write!(formatter, "/{pattern}/")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize)]
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum Delimiter {
|
||||
SingleQuotes,
|
||||
@@ -115,7 +128,7 @@ pub enum Delimiter {
|
||||
None,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Clone, Serialize)]
|
||||
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct UserInputLiteral {
|
||||
pub field_name: Option<String>,
|
||||
@@ -154,7 +167,7 @@ impl fmt::Debug for UserInputLiteral {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Debug, Clone, Serialize)]
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone, Serialize)]
|
||||
#[serde(tag = "type", content = "value")]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum UserInputBound {
|
||||
@@ -191,11 +204,11 @@ impl UserInputBound {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Clone, Serialize)]
|
||||
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
|
||||
#[serde(into = "UserInputAstSerde")]
|
||||
pub enum UserInputAst {
|
||||
Clause(Vec<(Option<Occur>, UserInputAst)>),
|
||||
Boost(Box<UserInputAst>, f64),
|
||||
Boost(Box<UserInputAst>, ordered_float::OrderedFloat<f64>),
|
||||
Leaf(Box<UserInputLeaf>),
|
||||
}
|
||||
|
||||
@@ -217,9 +230,10 @@ impl From<UserInputAst> for UserInputAstSerde {
|
||||
fn from(ast: UserInputAst) -> Self {
|
||||
match ast {
|
||||
UserInputAst::Clause(clause) => UserInputAstSerde::Bool { clauses: clause },
|
||||
UserInputAst::Boost(underlying, boost) => {
|
||||
UserInputAstSerde::Boost { underlying, boost }
|
||||
}
|
||||
UserInputAst::Boost(underlying, boost) => UserInputAstSerde::Boost {
|
||||
underlying,
|
||||
boost: boost.into_inner(),
|
||||
},
|
||||
UserInputAst::Leaf(leaf) => UserInputAstSerde::Leaf(leaf),
|
||||
}
|
||||
}
|
||||
@@ -378,7 +392,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_boost_serialization() {
|
||||
let inner_ast = UserInputAst::Leaf(Box::new(UserInputLeaf::All));
|
||||
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5);
|
||||
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5.into());
|
||||
let json = serde_json::to_string(&boost_ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
@@ -405,7 +419,7 @@ mod tests {
|
||||
}))),
|
||||
),
|
||||
])),
|
||||
2.5,
|
||||
2.5.into(),
|
||||
);
|
||||
let json = serde_json::to_string(&boost_ast).unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -155,7 +155,7 @@ fn test_aggregation_flushing(
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
};
|
||||
|
||||
let res: Value = serde_json::to_value(&agg_res)?;
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
|
||||
assert_eq!(res["bucketsL1"]["buckets"][0]["doc_count"], 3);
|
||||
assert_eq!(
|
||||
@@ -270,7 +270,7 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::to_value(&agg_res)?;
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
assert_eq!(res["average"]["value"], 12.142857142857142);
|
||||
assert_eq!(
|
||||
res["range"]["buckets"],
|
||||
@@ -304,29 +304,6 @@ fn test_aggregation_level1_simple() -> crate::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_term_truncate_sum_other_doc_count() {
|
||||
let index = get_test_index_2_segments(true).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let count_per_text: Aggregation = serde_json::from_value(json!({ "terms": { "field": "text", "size": 1 } })).unwrap();
|
||||
let aggs: Aggregations = vec![("group_by_term_truncate".to_string(), count_per_text)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = get_collector(aggs);
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::to_value(&agg_res).unwrap();
|
||||
assert_eq!(res, serde_json::json!({
|
||||
"group_by_term_truncate": {
|
||||
"buckets": [{ "doc_count": 7, "key": "cool" }],
|
||||
"doc_count_error_upper_bound": 0,
|
||||
"sum_other_doc_count": 2,
|
||||
},
|
||||
}));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_level1() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(true)?;
|
||||
@@ -365,7 +342,7 @@ fn test_aggregation_level1() -> crate::Result<()> {
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::to_value(&agg_res)?;
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
assert_eq!(res["average"]["value"], 12.142857142857142);
|
||||
assert_eq!(res["average_f64"]["value"], 12.214285714285714);
|
||||
assert_eq!(res["average_i64"]["value"], 12.142857142857142);
|
||||
@@ -420,7 +397,7 @@ fn test_aggregation_level2(
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let elasticsearch_compatible_json_req = serde_json::json!(
|
||||
let elasticsearch_compatible_json_req = r#"
|
||||
{
|
||||
"rangef64": {
|
||||
"range": {
|
||||
@@ -473,8 +450,9 @@ fn test_aggregation_level2(
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
let agg_req: Aggregations = serde_json::from_value(elasticsearch_compatible_json_req).unwrap();
|
||||
}
|
||||
"#;
|
||||
let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
|
||||
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector =
|
||||
@@ -491,7 +469,7 @@ fn test_aggregation_level2(
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
};
|
||||
|
||||
let res: Value = serde_json::to_value(agg_res)?;
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
|
||||
assert_eq!(res["range"]["buckets"][1]["key"], "3-7");
|
||||
assert_eq!(res["range"]["buckets"][1]["doc_count"], 2u64);
|
||||
|
||||
@@ -301,7 +301,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
|
||||
let bounds = self.bounds;
|
||||
let interval = self.interval;
|
||||
let offset = self.offset;
|
||||
let get_bucket_pos = |val| (get_bucket_pos_f64(val, interval, offset) as i64);
|
||||
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
|
||||
|
||||
bucket_agg_accessor
|
||||
.column_block_accessor
|
||||
|
||||
@@ -484,7 +484,6 @@ impl FacetCounts {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
use std::iter;
|
||||
|
||||
use columnar::Dictionary;
|
||||
use rand::distributions::Uniform;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use columnar::NumericalValue;
|
||||
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
||||
use common::{replace_in_place, JsonPathWriter};
|
||||
use rustc_hash::FxHashMap;
|
||||
@@ -152,7 +153,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
if let Ok(i64_val) = val.try_into() {
|
||||
term_buffer.append_type_and_fast_value::<i64>(i64_val);
|
||||
} else {
|
||||
term_buffer.append_type_and_fast_value(val);
|
||||
term_buffer.append_type_and_fast_value::<u64>(val);
|
||||
}
|
||||
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
|
||||
}
|
||||
@@ -166,12 +167,30 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
|
||||
}
|
||||
ReferenceValueLeaf::F64(val) => {
|
||||
if !val.is_finite() {
|
||||
return;
|
||||
};
|
||||
set_path_id(
|
||||
term_buffer,
|
||||
ctx.path_to_unordered_id
|
||||
.get_or_allocate_unordered_id(json_path_writer.as_str()),
|
||||
);
|
||||
term_buffer.append_type_and_fast_value(val);
|
||||
// Normalize here is important.
|
||||
// In the inverted index, we coerce all numerical values to their canonical
|
||||
// representation.
|
||||
//
|
||||
// (We do the same thing on the query side)
|
||||
match NumericalValue::F64(val).normalize() {
|
||||
NumericalValue::I64(val_i64) => {
|
||||
term_buffer.append_type_and_fast_value::<i64>(val_i64);
|
||||
}
|
||||
NumericalValue::U64(val_u64) => {
|
||||
term_buffer.append_type_and_fast_value::<u64>(val_u64);
|
||||
}
|
||||
NumericalValue::F64(val_f64) => {
|
||||
term_buffer.append_type_and_fast_value::<f64>(val_f64);
|
||||
}
|
||||
}
|
||||
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
|
||||
}
|
||||
ReferenceValueLeaf::Bool(val) => {
|
||||
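To make the normalization comment above concrete, a small sketch of the coercion rule using the columnar crate's `NumericalValue` (illustrative, not part of the diff):

```rust
use columnar::NumericalValue;

fn main() {
    // 1u64 and 1.0f64 both normalize to the canonical I64(1), so the JSON values
    // 1 and 1.0 produce the same indexed term and `number.key:1` matches either.
    assert_eq!(NumericalValue::from(1u64).normalize(), NumericalValue::I64(1));
    assert_eq!(NumericalValue::from(1.0f64).normalize(), NumericalValue::I64(1));
    // Values with no exact integer representation stay F64.
    assert_eq!(
        NumericalValue::from(1.5f64).normalize(),
        NumericalValue::F64(1.5)
    );
}
```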
@@ -241,8 +260,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
///
|
||||
/// The term must be json + JSON path.
|
||||
pub fn convert_to_fast_value_and_append_to_json_term(
|
||||
mut term: Term,
|
||||
phrase: &str,
|
||||
term: &Term,
|
||||
text: &str,
|
||||
truncate_date_for_search: bool,
|
||||
) -> Option<Term> {
|
||||
assert_eq!(
|
||||
@@ -254,31 +273,50 @@ pub fn convert_to_fast_value_and_append_to_json_term(
|
||||
0,
|
||||
"JSON value bytes should be empty"
|
||||
);
|
||||
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
|
||||
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
|
||||
if truncate_date_for_search {
|
||||
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
|
||||
try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
|
||||
.or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
|
||||
.or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
|
||||
}
|
||||
|
||||
fn try_convert_to_datetime_and_append_to_json_term(
|
||||
term: &Term,
|
||||
text: &str,
|
||||
truncate_date_for_search: bool,
|
||||
) -> Option<Term> {
|
||||
let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
|
||||
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
|
||||
if truncate_date_for_search {
|
||||
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
|
||||
}
|
||||
let mut term_clone = term.clone();
|
||||
term_clone.append_type_and_fast_value(dt);
|
||||
Some(term_clone)
|
||||
}
|
||||
|
||||
fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
|
||||
let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
|
||||
let mut term_clone = term.clone();
|
||||
// Parse is actually returning normalized values already today, but let's not
// rely on that hidden contract.
|
||||
match numerical_value.normalize() {
|
||||
NumericalValue::I64(i64_value) => {
|
||||
term_clone.append_type_and_fast_value::<i64>(i64_value);
|
||||
}
|
||||
NumericalValue::U64(u64_value) => {
|
||||
term_clone.append_type_and_fast_value::<u64>(u64_value);
|
||||
}
|
||||
NumericalValue::F64(f64_value) => {
|
||||
term_clone.append_type_and_fast_value::<f64>(f64_value);
|
||||
}
|
||||
term.append_type_and_fast_value(dt);
|
||||
return Some(term);
|
||||
}
|
||||
if let Ok(i64_val) = str::parse::<i64>(phrase) {
|
||||
term.append_type_and_fast_value(i64_val);
|
||||
return Some(term);
|
||||
}
|
||||
if let Ok(u64_val) = str::parse::<u64>(phrase) {
|
||||
term.append_type_and_fast_value(u64_val);
|
||||
return Some(term);
|
||||
}
|
||||
if let Ok(f64_val) = str::parse::<f64>(phrase) {
|
||||
term.append_type_and_fast_value(f64_val);
|
||||
return Some(term);
|
||||
}
|
||||
if let Ok(bool_val) = str::parse::<bool>(phrase) {
|
||||
term.append_type_and_fast_value(bool_val);
|
||||
return Some(term);
|
||||
}
|
||||
None
|
||||
Some(term_clone)
|
||||
}
|
||||
|
||||
fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
|
||||
let val = str::parse::<bool>(text).ok()?;
|
||||
let mut term_clone = term.clone();
|
||||
term_clone.append_type_and_fast_value(val);
|
||||
Some(term_clone)
|
||||
}
|
||||
|
||||
/// Splits a json path supplied to the query parser in such a way that
|
||||
|
||||
@@ -484,10 +484,8 @@ impl Directory for MmapDirectory {
|
||||
.map_err(LockError::wrap_io_error)?;
|
||||
if lock.is_blocking {
|
||||
file.lock_exclusive().map_err(LockError::wrap_io_error)?;
|
||||
} else {
|
||||
if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
|
||||
return Err(LockError::LockBusy);
|
||||
}
|
||||
} else if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
|
||||
return Err(LockError::LockBusy);
|
||||
}
|
||||
// dropping the file handle will release the lock.
|
||||
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
|
||||
|
||||
@@ -146,7 +146,7 @@ impl InvertedIndexReader {
|
||||
positions_size: ByteCount::default(),
|
||||
num_terms: 0u64,
|
||||
};
|
||||
field_space.record(&term_info);
|
||||
field_space.record(term_info);
|
||||
|
||||
// We include the json type and the json end of path to make sure the prefix check
|
||||
// is meaningful.
|
||||
|
||||
@@ -615,7 +615,7 @@ impl<D: Document> IndexWriter<D> {
|
||||
/// It is also possible to add a payload to the `commit`
|
||||
/// using this API.
|
||||
/// See [`PreparedCommit::set_payload()`].
|
||||
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
|
||||
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, D>> {
|
||||
// Here, because we join all of the worker threads,
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
|
||||
51  src/lib.rs
@@ -55,7 +55,7 @@
|
||||
//! // between indexing threads.
|
||||
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
|
||||
//!
|
||||
//! // Let's index one documents!
|
||||
//! // Let's index a document!
|
||||
//! index_writer.add_document(doc!(
|
||||
//! title => "The Old Man and the Sea",
|
||||
//! body => "He was an old man who fished alone in a skiff in \
|
||||
@@ -370,6 +370,8 @@ macro_rules! fail_point {
|
||||
/// Common test utilities.
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
|
||||
use rand::distributions::{Bernoulli, Uniform};
|
||||
@@ -382,7 +384,7 @@ pub mod tests {
|
||||
use crate::index::SegmentReader;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::query::{BooleanQuery, QueryParser};
|
||||
use crate::schema::*;
|
||||
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
|
||||
|
||||
@@ -1223,4 +1225,49 @@ pub mod tests {
|
||||
);
|
||||
assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_number_ambiguity() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
{
|
||||
let mut doc = TantivyDocument::new();
|
||||
let mut obj = BTreeMap::default();
|
||||
obj.insert("key".to_string(), OwnedValue::I64(1i64));
|
||||
doc.add_object(json_field, obj);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = TantivyDocument::new();
|
||||
let mut obj = BTreeMap::default();
|
||||
obj.insert("key".to_string(), OwnedValue::U64(1u64));
|
||||
doc.add_object(json_field, obj);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
{
|
||||
let mut doc = TantivyDocument::new();
|
||||
let mut obj = BTreeMap::default();
|
||||
obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
|
||||
doc.add_object(json_field, obj);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(searcher.num_docs(), 3);
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![]);
|
||||
let query = parser.parse_query("number.key:1").unwrap();
|
||||
let count = searcher.search(&query, &crate::collector::Count).unwrap();
|
||||
assert_eq!(count, 3);
|
||||
}
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![]);
|
||||
let query = parser.parse_query("number.key:1.0").unwrap();
|
||||
let count = searcher.search(&query, &crate::collector::Count).unwrap();
|
||||
assert_eq!(count, 3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,19 +227,6 @@ impl BlockSegmentPostings {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Returns a full block, regardless of whether the block is complete or incomplete (
|
||||
/// as it happens for the last block of the posting list).
|
||||
///
|
||||
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
|
||||
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
|
||||
///
|
||||
/// This method is useful to run SSE2 linear search.
|
||||
#[inline]
|
||||
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.doc_decoder.full_output()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
@@ -275,22 +262,36 @@ impl BlockSegmentPostings {
|
||||
///
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last, incomplete VInt block.
|
||||
pub fn seek(&mut self, target_doc: DocId) {
|
||||
self.shallow_seek(target_doc);
|
||||
pub fn seek(&mut self, target_doc: DocId) -> usize {
|
||||
// Move to the block that might contain our document.
|
||||
self.seek_block(target_doc);
|
||||
self.load_block();
|
||||
|
||||
// At this point we are on the block that might contain our document.
|
||||
let doc = self.doc_decoder.seek_within_block(target_doc);
|
||||
|
||||
// The last block is not full and padded with TERMINATED,
|
||||
// so we are guaranteed to have at least one value (real or padding)
|
||||
// that is >= target_doc.
|
||||
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target_doc`.
|
||||
// If all docs are smaller than target, the current block is incomplete and padded
|
||||
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
|
||||
doc
|
||||
}
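A short usage sketch of the new contract (hedged; `postings` names a loaded `BlockSegmentPostings` here for illustration, and mirrors how `SegmentPostings::seek` consumes the return value further down in this diff):

// `seek` loads the candidate block and returns the in-block index of the
// first doc >= `target`, which may point at the TERMINATED padding.
let idx = postings.seek(target);
let doc = postings.doc(idx);
debug_assert!(doc >= target);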
|
||||
|
||||
pub(crate) fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
|
||||
/// Dangerous API! This calls seek on the skip list,
|
||||
/// Dangerous API! This seeks to the target block on the skip list,
|
||||
/// but does not `.load_block()` afterwards.
|
||||
///
|
||||
/// `.load_block()` needs to be called manually afterwards.
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last, incomplete VInt block.
|
||||
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
|
||||
if self.skip_reader.seek(target_doc) {
|
||||
self.block_max_score_cache = None;
|
||||
self.block_loaded = false;
|
||||
|
||||
@@ -151,9 +151,11 @@ impl BlockDecoder {
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
/// Returns the in-block index of the first value >= `target`.
|
||||
/// Uses the padded buffer to enable branchless search.
|
||||
#[inline]
|
||||
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
|
||||
&self.output
|
||||
pub(crate) fn seek_within_block(&self, target: u32) -> usize {
|
||||
crate::postings::branchless_binary_search(&self.output, target)
|
||||
}
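For reference, a branchless lower bound over a TERMINATED-padded block can be sketched as follows (a simplified illustration, not the exact `branchless_binary_search` used above; it assumes the block is sorted and padded with `u32::MAX`, so a match always exists and no bounds checks or early exits are needed):

fn lower_bound_padded(block: &[u32; 128], target: u32) -> usize {
    // Halve the search range without early exits; the single data-dependent
    // `if` typically compiles down to a conditional move rather than a branch.
    let mut base = 0usize;
    let mut len = block.len();
    while len > 1 {
        let half = len / 2;
        if block[base + half] < target {
            base += half;
        }
        len -= half;
    }
    // Final adjustment: step past `base` if its value is still below `target`.
    base + usize::from(block[base] < target)
}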
|
||||
|
||||
#[inline]
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::docset::DocSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings};
|
||||
use crate::postings::{BlockSegmentPostings, Postings};
|
||||
use crate::{DocId, TERMINATED};
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated with
|
||||
@@ -175,26 +175,11 @@ impl DocSet for SegmentPostings {
|
||||
return self.doc();
|
||||
}
|
||||
|
||||
self.block_cursor.seek(target);
|
||||
|
||||
// At this point we are on the block that might contain our document.
|
||||
let output = self.block_cursor.full_block();
|
||||
self.cur = branchless_binary_search(output, target);
|
||||
|
||||
// The last block is not full and padded with the value TERMINATED,
|
||||
// so that we are guaranteed to have at least one doc in the block (a real one or the padding)
|
||||
// that is greater or equal to the target.
|
||||
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target`
|
||||
|
||||
// If all docs are smaller than target, the current block should be incomplete and padded
|
||||
// with the value `TERMINATED`.
|
||||
//
|
||||
// After the search, the cursor should point to the first value of TERMINATED.
|
||||
let doc = output[self.cur];
|
||||
// Delegate block-local search to BlockSegmentPostings::seek, which returns
|
||||
// the in-block index of the first doc >= target.
|
||||
self.cur = self.block_cursor.seek(target);
|
||||
let doc = self.doc();
|
||||
debug_assert!(doc >= target);
|
||||
debug_assert_eq!(doc, self.doc());
|
||||
doc
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ impl InvertedIndexSerializer {
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> io::Result<FieldSerializer> {
|
||||
) -> io::Result<FieldSerializer<'_>> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
@@ -126,7 +126,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
|
||||
let average_fieldnorm = fieldnorm_reader
|
||||
.as_ref()
|
||||
.map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
|
||||
.map(|ff_reader| total_num_tokens as Score / ff_reader.num_docs() as Score)
|
||||
.unwrap_or(0.0);
|
||||
let postings_serializer = PostingsSerializer::new(
|
||||
postings_write,
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::query::Explanation;
|
||||
use crate::schema::Field;
|
||||
@@ -68,12 +66,6 @@ fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] {
|
||||
cache
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
|
||||
pub struct Bm25Params {
|
||||
pub idf: Score,
|
||||
pub avg_fieldnorm: Score,
|
||||
}
|
||||
|
||||
/// A struct used for computing BM25 scores.
|
||||
#[derive(Clone)]
|
||||
pub struct Bm25Weight {
|
||||
|
||||
@@ -167,7 +167,7 @@ pub fn block_wand(
|
||||
let block_max_score_upperbound: Score = scorers[..pivot_len]
|
||||
.iter_mut()
|
||||
.map(|scorer| {
|
||||
scorer.shallow_seek(pivot_doc);
|
||||
scorer.seek_block(pivot_doc);
|
||||
scorer.block_max_score()
|
||||
})
|
||||
.sum();
|
||||
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
|
||||
return;
|
||||
}
|
||||
doc = last_doc_in_block + 1;
|
||||
scorer.shallow_seek(doc);
|
||||
scorer.seek_block(doc);
|
||||
}
|
||||
// Seek will effectively load that block.
|
||||
doc = scorer.seek(doc);
|
||||
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
|
||||
}
|
||||
}
|
||||
doc += 1;
|
||||
scorer.shallow_seek(doc);
|
||||
scorer.seek_block(doc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,7 +302,6 @@ fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
|
||||
mod tests {
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::iter;
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
use core::fmt::Debug;
|
||||
|
||||
use columnar::{ColumnIndex, DynamicColumn};
|
||||
use common::BitSet;
|
||||
|
||||
use super::{ConstScorer, EmptyScorer};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::index::SegmentReader;
|
||||
use crate::query::all_query::AllScorer;
|
||||
use crate::query::boost_query::BoostScorer;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, Score, TantivyError};
|
||||
|
||||
@@ -113,13 +116,49 @@ impl Weight for ExistsWeight {
|
||||
non_empty_columns.push(column)
|
||||
}
|
||||
}
|
||||
// TODO: we can optimize more here since in most cases we will have only one index
|
||||
if !non_empty_columns.is_empty() {
|
||||
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
} else {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
if non_empty_columns.is_empty() {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
}
|
||||
|
||||
// If any column is full, all docs match.
|
||||
let max_doc = reader.max_doc();
|
||||
if non_empty_columns
|
||||
.iter()
|
||||
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
|
||||
{
|
||||
let all_scorer = AllScorer::new(max_doc);
|
||||
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
|
||||
}
|
||||
|
||||
// If we have only a few dynamic columns, use ExistsDocSet
|
||||
// NOTE: A lower number may be better for very sparse columns
|
||||
if non_empty_columns.len() < 4 {
|
||||
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
|
||||
return Ok(Box::new(ConstScorer::new(docset, boost)));
|
||||
}
|
||||
|
||||
// If we have many dynamic columns, precompute a bitset of matching docs
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc);
|
||||
for column in &non_empty_columns {
|
||||
match column.column_index() {
|
||||
ColumnIndex::Empty { .. } => {}
|
||||
ColumnIndex::Full => {
|
||||
// Handled by AllScorer return above.
|
||||
}
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
for doc in optional_index.iter_non_null_docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
ColumnIndex::Multivalued(multi_idx) => {
|
||||
for doc in multi_idx.iter_non_null_docs() {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let docset = BitSetDocSet::from(doc_bitset);
|
||||
Ok(Box::new(ConstScorer::new(docset, boost)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
@@ -294,6 +333,43 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
|
||||
// Build docs where no single subpath exists for all docs, but the union does.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json = schema_builder.add_json_field("json", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for i in 0u64..100u64 {
|
||||
if i % 2 == 0 {
|
||||
// only subpath `a`
|
||||
index_writer.add_document(doc!(json => json!({"a": i})))?;
|
||||
} else {
|
||||
// only subpath `b`
|
||||
index_writer.add_document(doc!(json => json!({"b": i})))?;
|
||||
}
|
||||
}
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// No single subpath is full
|
||||
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
|
||||
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
|
||||
|
||||
// Root exists with subpaths disabled is zero
|
||||
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
|
||||
|
||||
// Root exists with subpaths enabled should match all docs via union
|
||||
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -104,7 +104,7 @@ mod tests {
|
||||
let query = query_parser.parse_query("a a a a a").unwrap();
|
||||
let mut terms = Vec::new();
|
||||
query.query_terms(&mut |term, pos| terms.push((term, pos)));
|
||||
assert_eq!(vec![(&term_a, false); 5], terms);
|
||||
assert_eq!(vec![(&term_a, false); 1], terms);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("a -b").unwrap();
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
use std::fmt;
|
||||
use std::ops::Bound;
|
||||
use std::sync::Arc;
|
||||
|
||||
use tantivy_fst::Regex;
|
||||
|
||||
use crate::query::Occur;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::{Field, Term};
|
||||
use crate::Score;
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -21,6 +24,10 @@ pub enum LogicalLiteral {
|
||||
elements: Vec<Term>,
|
||||
},
|
||||
All,
|
||||
Regex {
|
||||
pattern: Arc<Regex>,
|
||||
field: Field,
|
||||
},
|
||||
}
|
||||
|
||||
pub enum LogicalAst {
|
||||
@@ -38,6 +45,7 @@ impl LogicalAst {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Move to rewrite_ast in query_grammar
|
||||
pub fn simplify(self) -> LogicalAst {
|
||||
match self {
|
||||
LogicalAst::Clause(clauses) => {
|
||||
@@ -147,6 +155,10 @@ impl fmt::Debug for LogicalLiteral {
|
||||
write!(formatter, "]")
|
||||
}
|
||||
LogicalLiteral::All => write!(formatter, "*"),
|
||||
LogicalLiteral::Regex {
|
||||
ref pattern,
|
||||
ref field,
|
||||
} => write!(formatter, "Regex({field:?}, {pattern:?})"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,12 +2,14 @@ use std::net::{AddrParseError, IpAddr};
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::ops::Bound;
|
||||
use std::str::{FromStr, ParseBoolError};
|
||||
use std::sync::Arc;
|
||||
|
||||
use base64::engine::general_purpose::STANDARD as BASE64;
|
||||
use base64::Engine;
|
||||
use itertools::Itertools;
|
||||
use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use rustc_hash::FxHashMap;
|
||||
use tantivy_fst::Regex;
|
||||
|
||||
use super::logical_ast::*;
|
||||
use crate::index::Index;
|
||||
@@ -15,7 +17,7 @@ use crate::json_utils::convert_to_fast_value_and_append_to_json_term;
|
||||
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
|
||||
use crate::query::{
|
||||
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
|
||||
PhraseQuery, Query, TermQuery, TermSetQuery,
|
||||
PhraseQuery, Query, RegexQuery, TermQuery, TermSetQuery,
|
||||
};
|
||||
use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
|
||||
@@ -206,6 +208,7 @@ pub struct QueryParser {
|
||||
tokenizer_manager: TokenizerManager,
|
||||
boost: FxHashMap<Field, Score>,
|
||||
fuzzy: FxHashMap<Field, Fuzzy>,
|
||||
regexes_allowed: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -260,6 +263,7 @@ impl QueryParser {
|
||||
conjunction_by_default: false,
|
||||
boost: Default::default(),
|
||||
fuzzy: Default::default(),
|
||||
regexes_allowed: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,6 +324,11 @@ impl QueryParser {
|
||||
);
|
||||
}
|
||||
|
||||
/// Allow regexes in queries
|
||||
pub fn allow_regexes(&mut self) {
|
||||
self.regexes_allowed = true;
|
||||
}
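A hedged usage sketch of the new opt-in (the field name is illustrative; the `/pattern/` syntax matches the tests added below):

let mut query_parser = QueryParser::for_index(&index, vec![title]);
// Regex literals are rejected with an UnsupportedQuery error unless enabled.
query_parser.allow_regexes();
let query = query_parser.parse_query("title:/.*b/")?;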
|
||||
|
||||
/// Parse a query
|
||||
///
|
||||
/// Note that `parse_query` returns an error if the input
|
||||
@@ -486,24 +495,17 @@ impl QueryParser {
|
||||
Ok(terms.into_iter().next().unwrap())
|
||||
}
|
||||
FieldType::JsonObject(ref json_options) => {
|
||||
let get_term_with_path = || {
|
||||
Term::from_field_json_path(
|
||||
field,
|
||||
json_path,
|
||||
json_options.is_expand_dots_enabled(),
|
||||
)
|
||||
};
|
||||
let mut term = Term::from_field_json_path(
|
||||
field,
|
||||
json_path,
|
||||
json_options.is_expand_dots_enabled(),
|
||||
);
|
||||
if let Some(term) =
|
||||
// Try to convert the phrase to a fast value
|
||||
convert_to_fast_value_and_append_to_json_term(
|
||||
get_term_with_path(),
|
||||
phrase,
|
||||
false,
|
||||
)
|
||||
convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
|
||||
{
|
||||
Ok(term)
|
||||
} else {
|
||||
let mut term = get_term_with_path();
|
||||
term.append_type_and_str(phrase);
|
||||
Ok(term)
|
||||
}
|
||||
@@ -670,7 +672,7 @@ impl QueryParser {
|
||||
}
|
||||
UserInputAst::Boost(ast, boost) => {
|
||||
let (ast, errors) = self.compute_logical_ast_with_occur_lenient(*ast);
|
||||
(ast.boost(boost as Score), errors)
|
||||
(ast.boost(boost.into_inner() as Score), errors)
|
||||
}
|
||||
UserInputAst::Leaf(leaf) => {
|
||||
let (ast, errors) = self.compute_logical_ast_from_leaf_lenient(*leaf);
|
||||
@@ -860,6 +862,51 @@ impl QueryParser {
|
||||
"Range query need to target a specific field.".to_string(),
|
||||
)],
|
||||
),
|
||||
UserInputLeaf::Regex { field, pattern } => {
|
||||
if !self.regexes_allowed {
|
||||
return (
|
||||
None,
|
||||
vec![QueryParserError::UnsupportedQuery(
|
||||
"Regex queries are not allowed.".to_string(),
|
||||
)],
|
||||
);
|
||||
}
|
||||
let full_path = try_tuple!(field.ok_or_else(|| {
|
||||
QueryParserError::UnsupportedQuery(
|
||||
"Regex query need to target a specific field.".to_string(),
|
||||
)
|
||||
}));
|
||||
let (field, json_path) = try_tuple!(self
|
||||
.split_full_path(&full_path)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
|
||||
if !json_path.is_empty() {
|
||||
return (
|
||||
None,
|
||||
vec![QueryParserError::UnsupportedQuery(
|
||||
"Regex query does not support json paths.".to_string(),
|
||||
)],
|
||||
);
|
||||
}
|
||||
if !matches!(
|
||||
self.schema.get_field_entry(field).field_type(),
|
||||
FieldType::Str(_)
|
||||
) {
|
||||
return (
|
||||
None,
|
||||
vec![QueryParserError::UnsupportedQuery(
|
||||
"Regex query only supported on text fields".to_string(),
|
||||
)],
|
||||
);
|
||||
}
|
||||
let pattern = try_tuple!(Regex::new(&pattern).map_err(|e| {
|
||||
QueryParserError::UnsupportedQuery(format!("Invalid regex: {e}"))
|
||||
}));
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Regex {
|
||||
pattern: Arc::new(pattern),
|
||||
field,
|
||||
}));
|
||||
(Some(logical_ast), Vec::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -902,6 +949,9 @@ fn convert_literal_to_query(
|
||||
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
|
||||
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
|
||||
LogicalLiteral::All => Box::new(AllQuery),
|
||||
LogicalLiteral::Regex { pattern, field } => {
|
||||
Box::new(RegexQuery::from_regex(pattern, field))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -971,7 +1021,7 @@ fn generate_literals_for_json_object(
|
||||
|
||||
// Try to convert the phrase to a fast value
|
||||
if let Some(term) =
|
||||
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
|
||||
convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
|
||||
{
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
}
|
||||
@@ -1100,11 +1150,15 @@ mod test {
|
||||
query: &str,
|
||||
default_conjunction: bool,
|
||||
default_fields: &[&'static str],
|
||||
allow_regexes: bool,
|
||||
) -> Result<LogicalAst, QueryParserError> {
|
||||
let mut query_parser = make_query_parser_with_default_fields(default_fields);
|
||||
if default_conjunction {
|
||||
query_parser.set_conjunction_by_default();
|
||||
}
|
||||
if allow_regexes {
|
||||
query_parser.allow_regexes();
|
||||
}
|
||||
query_parser.parse_query_to_logical_ast(query)
|
||||
}
|
||||
|
||||
@@ -1116,6 +1170,7 @@ mod test {
|
||||
query,
|
||||
default_conjunction,
|
||||
&["title", "text"],
|
||||
true,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1130,6 +1185,7 @@ mod test {
|
||||
query,
|
||||
default_conjunction,
|
||||
default_fields,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
let query_str = format!("{query:?}");
|
||||
@@ -1993,4 +2049,66 @@ mod test {
|
||||
Err(QueryParserError::ExpectedInt(_))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_deduplication() {
|
||||
let query = "be be";
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
query,
|
||||
"(Term(field=0, type=Str, \"be\") Term(field=1, type=Str, \"be\"))",
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_regex() {
|
||||
let expected_regex = tantivy_fst::Regex::new(r".*b").unwrap();
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:/.*b/",
|
||||
format!("Regex(Field(0), {:#?})", expected_regex).as_str(),
|
||||
false,
|
||||
);
|
||||
|
||||
// Invalid field
|
||||
let err = parse_query_to_logical_ast("float:/.*b/", false).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Unsupported query: Regex query only supported on text fields"
|
||||
);
|
||||
|
||||
// No field specified
|
||||
let err = parse_query_to_logical_ast("/.*b/", false).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Unsupported query: Regex query need to target a specific field."
|
||||
);
|
||||
|
||||
// Regex on a json path
|
||||
let err = parse_query_to_logical_ast("title.subpath:/.*b/", false).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Unsupported query: Regex query does not support json paths."
|
||||
);
|
||||
|
||||
// Invalid regex
|
||||
let err = parse_query_to_logical_ast("title:/[A-Z*b/", false).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Unsupported query: Invalid regex: regex parse error:\n [A-Z*b\n ^\nerror: \
|
||||
unclosed character class"
|
||||
);
|
||||
|
||||
// Regexes not allowed
|
||||
let err = parse_query_to_logical_ast_with_default_fields(
|
||||
"title:/.*b/",
|
||||
false,
|
||||
&["title", "text"],
|
||||
false,
|
||||
)
|
||||
.unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Unsupported query: Regex queries are not allowed."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,10 +12,14 @@ pub use self::range_query_fastfield::*;
|
||||
// TODO is this correct?
|
||||
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
|
||||
match typ {
|
||||
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::Json => {
|
||||
true
|
||||
}
|
||||
Type::IpAddr => true,
|
||||
Type::Str
|
||||
| Type::U64
|
||||
| Type::I64
|
||||
| Type::F64
|
||||
| Type::Bool
|
||||
| Type::Date
|
||||
| Type::Json
|
||||
| Type::IpAddr => true,
|
||||
Type::Facet | Type::Bytes => false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -258,7 +258,7 @@ fn search_on_json_numerical_field(
|
||||
|
||||
let bounds = match typ.numerical_type().unwrap() {
|
||||
NumericalType::I64 => {
|
||||
let bounds = bounds.map_bound(|term| (term.as_i64().unwrap()));
|
||||
let bounds = bounds.map_bound(|term| term.as_i64().unwrap());
|
||||
match actual_column_type {
|
||||
NumericalType::I64 => bounds.map_bound(|&term| term.to_u64()),
|
||||
NumericalType::U64 => {
|
||||
@@ -282,7 +282,7 @@ fn search_on_json_numerical_field(
|
||||
}
|
||||
}
|
||||
NumericalType::U64 => {
|
||||
let bounds = bounds.map_bound(|term| (term.as_u64().unwrap()));
|
||||
let bounds = bounds.map_bound(|term| term.as_u64().unwrap());
|
||||
match actual_column_type {
|
||||
NumericalType::U64 => bounds.map_bound(|&term| term.to_u64()),
|
||||
NumericalType::I64 => {
|
||||
@@ -306,7 +306,7 @@ fn search_on_json_numerical_field(
|
||||
}
|
||||
}
|
||||
NumericalType::F64 => {
|
||||
let bounds = bounds.map_bound(|term| (term.as_f64().unwrap()));
|
||||
let bounds = bounds.map_bound(|term| term.as_f64().unwrap());
|
||||
match actual_column_type {
|
||||
NumericalType::U64 => transform_from_f64_bounds::<u64>(&bounds),
|
||||
NumericalType::I64 => transform_from_f64_bounds::<i64>(&bounds),
|
||||
|
||||
@@ -11,7 +11,7 @@ mod tests {
|
||||
use crate::docset::DocSet;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, FAST, STRING, TEXT};
|
||||
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};
|
||||
|
||||
#[test]
|
||||
@@ -212,4 +212,232 @@ mod tests {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_query_fallback_to_fastfield() -> crate::Result<()> {
|
||||
use crate::collector::Count;
|
||||
use crate::schema::FAST;
|
||||
|
||||
// Create a FAST-only numeric field (not indexed)
|
||||
let mut schema_builder = Schema::builder();
|
||||
let num_field = schema_builder.add_u64_field("num", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(num_field => 10u64))?;
|
||||
index_writer.add_document(doc!(num_field => 20u64))?;
|
||||
index_writer.add_document(doc!(num_field => 10u64))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// TermQuery should fall back to a fastfield range query and match correctly.
|
||||
let tq_10 = TermQuery::new(
|
||||
Term::from_field_u64(num_field, 10u64),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_20 = TermQuery::new(
|
||||
Term::from_field_u64(num_field, 20u64),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_30 = TermQuery::new(
|
||||
Term::from_field_u64(num_field, 30u64),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let count_10 = searcher.search(&tq_10, &Count)?;
|
||||
let count_20 = searcher.search(&tq_20, &Count)?;
|
||||
let count_30 = searcher.search(&tq_30, &Count)?;
|
||||
|
||||
assert_eq!(count_10, 2);
|
||||
assert_eq!(count_20, 1);
|
||||
assert_eq!(count_30, 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_query_fallback_text_fast_only() -> crate::Result<()> {
|
||||
use crate::collector::Count;
|
||||
|
||||
// FAST-only text field (not indexed)
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text_field => "hello"))?;
|
||||
index_writer.add_document(doc!(text_field => "world"))?;
|
||||
index_writer.add_document(doc!(text_field => "hello"))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
let tq_hello = TermQuery::new(
|
||||
Term::from_field_text(text_field, "hello"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_world = TermQuery::new(
|
||||
Term::from_field_text(text_field, "world"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_missing = TermQuery::new(
|
||||
Term::from_field_text(text_field, "nope"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
assert_eq!(searcher.search(&tq_hello, &Count)?, 2);
|
||||
assert_eq!(searcher.search(&tq_world, &Count)?, 1);
|
||||
assert_eq!(searcher.search(&tq_missing, &Count)?, 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_query_fallback_json_fast_only() -> crate::Result<()> {
|
||||
use crate::collector::Count;
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::schema::FAST;
|
||||
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "x"})))?;
|
||||
index_writer.add_document(doc!(json_field => json!({"a": 20, "b": "y"})))?;
|
||||
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "z"})))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
fn json_term_fast<T: FastValue>(field: Field, path: &str, v: T) -> Term {
|
||||
let mut term = Term::from_field_json_path(field, path, true);
|
||||
term.append_type_and_fast_value(v);
|
||||
term
|
||||
}
|
||||
fn json_term_str(field: Field, path: &str, v: &str) -> Term {
|
||||
let mut term = Term::from_field_json_path(field, path, true);
|
||||
term.append_type_and_str(v);
|
||||
term
|
||||
}
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
// numeric path match
|
||||
let tq_a10 = TermQuery::new(
|
||||
json_term_fast(json_field, "a", 10u64),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_a20 = TermQuery::new(
|
||||
json_term_fast(json_field, "a", 20u64),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_a30 = TermQuery::new(
|
||||
json_term_fast(json_field, "a", 30u64),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
assert_eq!(searcher.search(&tq_a10, &Count)?, 2);
|
||||
assert_eq!(searcher.search(&tq_a20, &Count)?, 1);
|
||||
assert_eq!(searcher.search(&tq_a30, &Count)?, 0);
|
||||
|
||||
// string path match
|
||||
let tq_bx = TermQuery::new(
|
||||
json_term_str(json_field, "b", "x"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_by = TermQuery::new(
|
||||
json_term_str(json_field, "b", "y"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_bm = TermQuery::new(
|
||||
json_term_str(json_field, "b", "missing"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
assert_eq!(searcher.search(&tq_bx, &Count)?, 1);
|
||||
assert_eq!(searcher.search(&tq_by, &Count)?, 1);
|
||||
assert_eq!(searcher.search(&tq_bm, &Count)?, 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_query_fallback_ip_fast_only() -> crate::Result<()> {
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
|
||||
use crate::collector::Count;
|
||||
use crate::schema::{IntoIpv6Addr, FAST};
|
||||
|
||||
let mut schema_builder = Schema::builder();
|
||||
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let ip1 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
|
||||
let ip2 = IpAddr::from_str("127.0.0.2").unwrap().into_ipv6_addr();
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(ip_field => ip1))?;
|
||||
index_writer.add_document(doc!(ip_field => ip2))?;
|
||||
index_writer.add_document(doc!(ip_field => ip1))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
let tq_ip1 = TermQuery::new(
|
||||
Term::from_field_ip_addr(ip_field, ip1),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let tq_ip2 = TermQuery::new(
|
||||
Term::from_field_ip_addr(ip_field, ip2),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let ip3 = IpAddr::from_str("127.0.0.3").unwrap().into_ipv6_addr();
|
||||
let tq_ip3 = TermQuery::new(
|
||||
Term::from_field_ip_addr(ip_field, ip3),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
assert_eq!(searcher.search(&tq_ip1, &Count)?, 2);
|
||||
assert_eq!(searcher.search(&tq_ip2, &Count)?, 1);
|
||||
assert_eq!(searcher.search(&tq_ip3, &Count)?, 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_query_fallback_fastfield_with_scores_errors() -> crate::Result<()> {
|
||||
use crate::collector::TopDocs;
|
||||
|
||||
// FAST-only numeric field (not indexed) should error when scoring is required
|
||||
let mut schema_builder = Schema::builder();
|
||||
let num_field = schema_builder.add_u64_field("num", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(num_field => 10u64))?;
|
||||
index_writer.add_document(doc!(num_field => 20u64))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
let tq = TermQuery::new(
|
||||
Term::from_field_u64(num_field, 10u64),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
// Using TopDocs requires scoring; since the field is not indexed,
|
||||
// TermQuery cannot score and should return a SchemaError.
|
||||
let res = searcher.search(&tq, &TopDocs::with_limit(1));
|
||||
assert!(matches!(res, Err(crate::TantivyError::SchemaError(_))));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use std::fmt;
|
||||
use std::ops::Bound;
|
||||
|
||||
use super::term_weight::TermWeight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::{EnableScoring, Explanation, Query, Weight};
|
||||
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
|
||||
use crate::query::{EnableScoring, Explanation, Query, RangeQuery, Weight};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::Term;
|
||||
|
||||
@@ -99,7 +101,7 @@ impl TermQuery {
|
||||
EnableScoring::Enabled {
|
||||
statistics_provider,
|
||||
..
|
||||
} => Bm25Weight::for_terms(statistics_provider, &[self.term.clone()])?,
|
||||
} => Bm25Weight::for_terms(statistics_provider, std::slice::from_ref(&self.term))?,
|
||||
EnableScoring::Disabled { .. } => {
|
||||
Bm25Weight::new(Explanation::new("<no score>", 1.0f32), 1.0f32)
|
||||
}
|
||||
@@ -122,6 +124,24 @@ impl TermQuery {
|
||||
|
||||
impl Query for TermQuery {
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
// If the field is not indexed but is a suitable fast field, fall back to a range query
|
||||
// on the fast field matching exactly this term.
|
||||
//
|
||||
// Note: This is considerably slower since it requires scanning the entire fast field.
|
||||
// TODO: The range query would gain from having a single-value optimization
|
||||
let schema = enable_scoring.schema();
|
||||
let field_entry = schema.get_field_entry(self.term.field());
|
||||
if !field_entry.is_indexed()
|
||||
&& field_entry.is_fast()
|
||||
&& is_type_valid_for_fastfield_range_query(self.term.typ())
|
||||
&& !enable_scoring.is_scoring_enabled()
|
||||
{
|
||||
let range_query = RangeQuery::new(
|
||||
Bound::Included(self.term.clone()),
|
||||
Bound::Included(self.term.clone()),
|
||||
);
|
||||
return range_query.weight(enable_scoring);
|
||||
}
|
||||
Ok(Box::new(self.specialized_weight(enable_scoring)?))
|
||||
}
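In short (a hedged sketch; `num_field` stands for a FAST-only, non-indexed u64 field, as in the tests earlier in this diff), the fallback only kicks in when scoring is disabled:

let term_query = TermQuery::new(Term::from_field_u64(num_field, 10u64), IndexRecordOption::Basic);
// Count needs no scores: the term query falls back to a fast-field range scan.
let count = searcher.search(&term_query, &Count)?;
// TopDocs needs BM25 scores: with a non-indexed field this yields a SchemaError.
assert!(searcher.search(&term_query, &TopDocs::with_limit(1)).is_err());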
|
||||
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
|
||||
|
||||
@@ -25,8 +25,8 @@ impl TermScorer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
|
||||
self.postings.block_cursor.shallow_seek(target_doc);
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
|
||||
self.postings.block_cursor.seek_block(target_doc);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -175,7 +175,7 @@ mod tests {
|
||||
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
|
||||
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
|
||||
assert_eq!(term_scorer.doc(), 0u32);
|
||||
term_scorer.shallow_seek(1289);
|
||||
term_scorer.seek_block(1289);
|
||||
assert_eq!(term_scorer.doc(), 0u32);
|
||||
term_scorer.seek(1289);
|
||||
assert_eq!(term_scorer.doc(), 1290);
|
||||
@@ -242,9 +242,9 @@ mod tests {
|
||||
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
|
||||
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
|
||||
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
|
||||
docs.shallow_seek(135);
|
||||
docs.seek_block(135);
|
||||
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
|
||||
docs.shallow_seek(256);
|
||||
docs.seek_block(256);
|
||||
// the block is not loaded yet.
|
||||
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
|
||||
assert_eq!(256, docs.seek(256));
|
||||
@@ -275,7 +275,7 @@ mod tests {
|
||||
{
|
||||
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
|
||||
for d in docs {
|
||||
term_scorer.shallow_seek(d);
|
||||
term_scorer.seek_block(d);
|
||||
block_max_scores_b.push(term_scorer.block_max_score());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,10 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::Scorer;
|
||||
use crate::{DocId, Score};
|
||||
|
||||
const HORIZON_NUM_TINYBITSETS: usize = 64;
|
||||
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
|
||||
// The buffered union looks ahead within a fixed-size sliding window
|
||||
// of upcoming document IDs (the "horizon").
|
||||
const HORIZON_NUM_TINYBITSETS: usize = HORIZON as usize / 64;
|
||||
const HORIZON: u32 = 64u32 * 64u32;
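As a worked example of the window arithmetic (hedged sketch; the names mirror the `BufferedUnionScorer` fields below):

fn bucket_of(doc: u32, window_start_doc: u32) -> (usize, u32) {
    // A doc inside the horizon maps to one TinySet bucket and one bit.
    let delta = doc - window_start_doc; // 0 <= delta < HORIZON (= 64 * 64 = 4096)
    ((delta / 64) as usize, delta % 64) // (index into `bitsets`, bit within that TinySet)
}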
|
||||
|
||||
// `drain_filter` is not stable yet.
|
||||
// This function is similar except that it is not unstable, and
|
||||
@@ -27,12 +29,26 @@ where P: FnMut(&mut T) -> bool {
|
||||
|
||||
/// Creates a `DocSet` that iterates through the union of two or more `DocSet`s.
|
||||
pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
/// Active scorers (already filtered of `TERMINATED`).
|
||||
docsets: Vec<TScorer>,
|
||||
/// Sliding window presence map for upcoming docs.
|
||||
///
|
||||
/// There are `HORIZON_NUM_TINYBITSETS` buckets, each covering
|
||||
/// a span of 64 doc IDs. Bucket `i` represents the range
|
||||
/// `[window_start_doc + i*64, window_start_doc + (i+1)*64)`.
|
||||
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
|
||||
// Index of the current TinySet bucket within the sliding window.
|
||||
bucket_idx: usize,
|
||||
/// Per-doc score combiners for the current window.
|
||||
///
|
||||
/// these accumulators merge contributions from all scorers that
|
||||
/// hit the same doc within the buffered window.
|
||||
scores: Box<[TScoreCombiner; HORIZON as usize]>,
|
||||
cursor: usize,
|
||||
offset: DocId,
|
||||
/// Start doc ID (inclusive) of the current sliding window.
|
||||
window_start_doc: DocId,
|
||||
/// Current doc ID of the union.
|
||||
doc: DocId,
|
||||
/// Combined score for current `doc` as produced by `TScoreCombiner`.
|
||||
score: Score,
|
||||
}
|
||||
|
||||
@@ -74,8 +90,8 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
|
||||
docsets: non_empty_docsets,
|
||||
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
|
||||
scores: Box::new([score_combiner_fn(); HORIZON as usize]),
|
||||
cursor: HORIZON_NUM_TINYBITSETS,
|
||||
offset: 0,
|
||||
bucket_idx: HORIZON_NUM_TINYBITSETS,
|
||||
window_start_doc: 0,
|
||||
doc: 0,
|
||||
score: 0.0,
|
||||
};
|
||||
@@ -89,8 +105,10 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
|
||||
|
||||
fn refill(&mut self) -> bool {
|
||||
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
|
||||
self.offset = min_doc;
|
||||
self.cursor = 0;
|
||||
// Reset the sliding window to start at the smallest doc
|
||||
// across all scorers and prebuffer within the horizon.
|
||||
self.window_start_doc = min_doc;
|
||||
self.bucket_idx = 0;
|
||||
self.doc = min_doc;
|
||||
refill(
|
||||
&mut self.docsets,
|
||||
@@ -105,16 +123,16 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
|
||||
}
|
||||
|
||||
fn advance_buffered(&mut self) -> bool {
|
||||
while self.cursor < HORIZON_NUM_TINYBITSETS {
|
||||
if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
|
||||
let delta = val + (self.cursor as u32) * 64;
|
||||
self.doc = self.offset + delta;
|
||||
while self.bucket_idx < HORIZON_NUM_TINYBITSETS {
|
||||
if let Some(val) = self.bitsets[self.bucket_idx].pop_lowest() {
|
||||
let delta = val + (self.bucket_idx as u32) * 64;
|
||||
self.doc = self.window_start_doc + delta;
|
||||
let score_combiner = &mut self.scores[delta as usize];
|
||||
self.score = score_combiner.score();
|
||||
score_combiner.clear();
|
||||
return true;
|
||||
} else {
|
||||
self.cursor += 1;
|
||||
self.bucket_idx += 1;
|
||||
}
|
||||
}
|
||||
false
|
||||
@@ -144,19 +162,19 @@ where
|
||||
if self.doc >= target {
|
||||
return self.doc;
|
||||
}
|
||||
let gap = target - self.offset;
|
||||
let gap = target - self.window_start_doc;
|
||||
if gap < HORIZON {
|
||||
// Our value is within the buffered horizon.
|
||||
|
||||
// Skipping to corresponding bucket.
|
||||
let new_cursor = gap as usize / 64;
|
||||
for obsolete_tinyset in &mut self.bitsets[self.cursor..new_cursor] {
|
||||
// Skipping to corresponding bucket.
|
||||
let new_bucket_idx = gap as usize / 64;
|
||||
for obsolete_tinyset in &mut self.bitsets[self.bucket_idx..new_bucket_idx] {
|
||||
obsolete_tinyset.clear();
|
||||
}
|
||||
for score_combiner in &mut self.scores[self.cursor * 64..new_cursor * 64] {
|
||||
for score_combiner in &mut self.scores[self.bucket_idx * 64..new_bucket_idx * 64] {
|
||||
score_combiner.clear();
|
||||
}
|
||||
self.cursor = new_cursor;
|
||||
self.bucket_idx = new_bucket_idx;
|
||||
|
||||
// Advancing until we reach the end of the bucket
|
||||
// or we reach a doc greater or equal to the target.
|
||||
@@ -211,7 +229,7 @@ where
|
||||
if self.doc == TERMINATED {
|
||||
return 0;
|
||||
}
|
||||
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
|
||||
let mut count = self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS]
|
||||
.iter()
|
||||
.map(|bitset| bitset.len())
|
||||
.sum::<u32>()
|
||||
@@ -225,7 +243,7 @@ where
|
||||
bitset.clear();
|
||||
}
|
||||
}
|
||||
self.cursor = HORIZON_NUM_TINYBITSETS;
|
||||
self.bucket_idx = HORIZON_NUM_TINYBITSETS;
|
||||
count
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,6 +80,7 @@
|
||||
//! }
|
||||
//!
|
||||
//! /// Our custom iterator just helps us to avoid some messy generics.
|
||||
//! #[allow(dead_code)]
|
||||
//! pub struct MyCustomIter<'a>(btree_map::Iter<'a, Field, serde_json::Value>);
|
||||
//! impl<'a> Iterator for MyCustomIter<'a> {
|
||||
//! // Here we can see our field-value pairs being produced by the iterator.
|
||||
|
||||
@@ -1561,7 +1561,6 @@ fn to_ascii(text: &str, output: &mut String) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter;
|
||||
|
||||
use super::to_ascii;
|
||||
use crate::tokenizer::{AsciiFoldingFilter, RawTokenizer, SimpleTokenizer, TextAnalyzer};
|
||||
|
||||
@@ -308,10 +308,9 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("Unsupported sstable version, expected one of [2, 3], found {version}"),
|
||||
));
|
||||
return Err(io::Error::other(format!(
|
||||
"Unsupported sstable version, expected one of [2, 3], found {version}"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -609,12 +608,12 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
|
||||
/// Returns a range builder, to stream all of the terms
|
||||
/// within an interval.
|
||||
pub fn range(&self) -> StreamerBuilder<TSSTable> {
|
||||
pub fn range(&self) -> StreamerBuilder<'_, TSSTable> {
|
||||
StreamerBuilder::new(self, AlwaysMatch)
|
||||
}
|
||||
|
||||
/// Returns a range builder filtered with a prefix.
|
||||
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<TSSTable> {
|
||||
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<'_, TSSTable> {
|
||||
let lower_bound = prefix.as_ref();
|
||||
let mut upper_bound = lower_bound.to_vec();
|
||||
for idx in (0..upper_bound.len()).rev() {
|
||||
@@ -633,7 +632,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
|
||||
}
|
||||
|
||||
/// A stream of all the sorted terms.
|
||||
pub fn stream(&self) -> io::Result<Streamer<TSSTable>> {
|
||||
pub fn stream(&self) -> io::Result<Streamer<'_, TSSTable>> {
|
||||
self.range().into_stream()
|
||||
}
|
||||
|
||||
|
||||
@@ -54,14 +54,14 @@ pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
|
||||
}
|
||||
}
|
||||
for _ in 0..len - 1 {
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
if head.0.key() == writer.last_inserted_key() {
|
||||
value_merger.add(head.0.value());
|
||||
if !head.0.advance()? {
|
||||
PeekMut::pop(head);
|
||||
}
|
||||
continue;
|
||||
if let Some(mut head) = heap.peek_mut()
|
||||
&& head.0.key() == writer.last_inserted_key()
|
||||
{
|
||||
value_merger.add(head.0.value());
|
||||
if !head.0.advance()? {
|
||||
PeekMut::pop(head);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -394,7 +394,7 @@ impl SSTableIndexBuilder {
|
||||
|
||||
fn fst_error_to_io_error(error: tantivy_fst::Error) -> io::Error {
|
||||
match error {
|
||||
tantivy_fst::Error::Fst(fst_error) => io::Error::new(io::ErrorKind::Other, fst_error),
|
||||
tantivy_fst::Error::Fst(fst_error) => io::Error::other(fst_error),
|
||||
tantivy_fst::Error::Io(ioerror) => ioerror,
|
||||
}
|
||||
}
|
||||
@@ -438,7 +438,7 @@ impl BlockAddrBlockMetadata {
|
||||
let ordinal_addr = range_start_addr + self.range_start_nbits as usize;
|
||||
let range_end_addr = range_start_addr + num_bits;
|
||||
|
||||
if (range_end_addr + self.range_start_nbits as usize + 7) / 8 > data.len() {
|
||||
if (range_end_addr + self.range_start_nbits as usize).div_ceil(8) > data.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
|
||||
@@ -274,13 +274,12 @@ impl SharedArenaHashMap {
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
return None;
|
||||
} else if kv.hash == hash {
|
||||
if let Some(val_addr) =
|
||||
} else if kv.hash == hash
|
||||
&& let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
|
||||
{
|
||||
let v = memory_arena.read(val_addr);
|
||||
return Some(v);
|
||||
}
|
||||
{
|
||||
let v = memory_arena.read(val_addr);
|
||||
return Some(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -334,15 +333,14 @@ impl SharedArenaHashMap {
|
||||
self.set_bucket(hash, key_addr, bucket);
|
||||
return val;
|
||||
}
|
||||
if kv.hash == hash {
|
||||
if let Some(val_addr) =
|
||||
if kv.hash == hash
|
||||
&& let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
|
||||
{
|
||||
let v = memory_arena.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
memory_arena.write_at(val_addr, new_v);
|
||||
return new_v;
|
||||
}
|
||||
{
|
||||
let v = memory_arena.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
memory_arena.write_at(val_addr, new_v);
|
||||
return new_v;
|
||||
}
|
||||
// This allows fetching the next bucket before the loop jmp
|
||||
bucket = probe.next_probe();
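Both rewrites above lean on let-chains (a recently stabilized Rust feature) to combine a boolean test and a pattern binding in a single `if`. A minimal sketch of the pattern, with `lookup` standing in for the arena lookup used here:

fn find(hash: u64, key_hash: u64, lookup: impl Fn() -> Option<u32>) -> Option<u32> {
    // Previously this was an `if let` nested inside the hash comparison;
    // the let-chain expresses the same control flow with one level less nesting.
    if hash == key_hash && let Some(val_addr) = lookup() {
        return Some(val_addr);
    }
    None
}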