Compare commits


1 Commit

Author: Pascal Seitz
SHA1: b4139bc353
Message: chore: Release
Date: 2025-08-20 18:21:53 +08:00
53 changed files with 276 additions and 1399 deletions

View File

@@ -14,18 +14,6 @@ Tantivy 0.25
- Support mixed field types in query parser [#2676](https://github.com/quickwit-oss/tantivy/pull/2676)(@trinity-1686a)
- Add per-field size details [#2679](https://github.com/quickwit-oss/tantivy/pull/2679)(@fulmicoton)
Tantivy 0.24.2
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
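A hedged illustration of the affected call pattern (the `rating` fast field, schema, and surrounding setup are assumptions for the sketch, not taken from the changelog):
```rust
use tantivy::collector::TopDocs;
use tantivy::query::Query;
use tantivy::{DocAddress, Order, Searcher};

// Minimal sketch: collect the 10 docs with the smallest `rating` values.
// Before the fix, Order::Asc could yield incorrect results here.
fn lowest_rated(
    searcher: &Searcher,
    query: &dyn Query,
) -> tantivy::Result<Vec<(u64, DocAddress)>> {
    let collector = TopDocs::with_limit(10).order_by_fast_field::<u64>("rating", Order::Asc);
    searcher.search(query, &collector)
}
```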
Tantivy 0.24.1
================================
- Fix: bump required rust version to 1.81
Tantivy 0.24
================================
Tantivy 0.24 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75. Tantivy 0.23 will be skipped.
@@ -108,14 +96,6 @@ This will slightly increase space and access time. [#2439](https://github.com/qu
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)
Tantivy 0.22.1
================================
- Fix TopNComputer for reverse order. [#2672](https://github.com/quickwit-oss/tantivy/pull/2672)(@stuhood @PSeitz)
Affected queries are [order_by_fast_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_fast_field) and
[order_by_u64_field](https://docs.rs/tantivy/latest/tantivy/collector/struct.TopDocs.html#method.order_by_u64_field)
for `Order::Asc`
Tantivy 0.22
================================

View File

@@ -167,12 +167,3 @@ harness = false
[[bench]]
name = "agg_bench"
harness = false
[[bench]]
name = "exists_json"
harness = false
[[bench]]
name = "and_or_queries"
harness = false

View File

@@ -1,4 +1,4 @@
# Releasing a new Tantivy Version
# Release a new Tantivy Version
## Steps
@@ -10,29 +10,12 @@
6. Set git tag with new version
[`cargo-release`](https://github.com/crate-ci/cargo-release) will help us with steps 1-5:
In conjunction with `cargo-release`, steps 1-4 (I'm not sure if the change detection works):
Set new packages to version 0.0.0.
Replace `--prev-tag-name` with the previous release tag:
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag
cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
```
Pass `--no-tag`, or it will create tags for all the subpackages.
cargo release will _not_ ignore unchanged packages, but it will print warnings for them.
e.g. "warning: updating ownedbytes to 0.10.0 despite no changes made since tag 0.24"
We need to manually exclude these unchanged packages:
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.24 --push-remote origin minor --no-tag --exclude tokenizer-api
```
Add `--execute` to actually publish the packages, otherwise it will only print the commands that would be run.
### Tag Version
```bash
git tag 0.25.0
git push upstream tag 0.25.0
```

View File

@@ -1,224 +0,0 @@
// Benchmarks boolean conjunction queries using binggan.
//
// What's measured:
// - OR and AND queries with varying selectivity (only `Term` queries on the leaves for now)
// - Nested AND/OR combinations (on multiple fields)
// - No-scoring path using the Count collector (focus on iterator/skip performance)
// - Top-K retrieval (k=10) using the TopDocs collector
//
// Corpus model:
// - Synthetic docs; each token a/b/c is independently included per doc
// - If none of a/b/c are included, emit a neutral filler token to keep doc length similar
//
// Notes:
// - After optimization, when scoring is disabled Tantivy reads doc-only postings
// (IndexRecordOption::Basic), avoiding frequency decoding overhead.
// - This bench isolates boolean iteration speed and intersection/union cost.
// - Use `cargo bench --bench boolean_conjunction` to run.
use binggan::{black_box, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, ReloadPolicy, Searcher};
#[derive(Clone)]
struct BenchIndex {
#[allow(dead_code)]
index: Index,
searcher: Searcher,
query_parser: QueryParser,
}
impl BenchIndex {
#[inline(always)]
fn count_query(&self, query_str: &str) -> usize {
let query = self.query_parser.parse_query(query_str).unwrap();
self.searcher.search(&query, &Count).unwrap()
}
#[inline(always)]
fn topk_len(&self, query_str: &str, k: usize) -> usize {
let query = self.query_parser.parse_query(query_str).unwrap();
self.searcher
.search(&query, &TopDocs::with_limit(k))
.unwrap()
.len()
}
}
/// Build a single index containing both fields (title, body) and
/// return two BenchIndex views:
/// - single_field: QueryParser defaults to only "body"
/// - multi_field: QueryParser defaults to ["title", "body"]
fn build_shared_indices(num_docs: usize, p_a: f32, p_b: f32, p_c: f32) -> (BenchIndex, BenchIndex) {
// Unified schema (two text fields)
let mut schema_builder = Schema::builder();
let f_title = schema_builder.add_text_field("title", TEXT);
let f_body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
// Populate index with stable RNG for reproducibility.
let mut rng = StdRng::from_seed([7u8; 32]);
// Populate: spread each present token 90/10 to body/title
{
let mut writer = index.writer(500_000_000).unwrap();
for _ in 0..num_docs {
let has_a = rng.gen_bool(p_a as f64);
let has_b = rng.gen_bool(p_b as f64);
let has_c = rng.gen_bool(p_c as f64);
let mut title_tokens: Vec<&str> = Vec::new();
let mut body_tokens: Vec<&str> = Vec::new();
if has_a {
if rng.gen_bool(0.1) {
title_tokens.push("a");
} else {
body_tokens.push("a");
}
}
if has_b {
if rng.gen_bool(0.1) {
title_tokens.push("b");
} else {
body_tokens.push("b");
}
}
if has_c {
if rng.gen_bool(0.1) {
title_tokens.push("c");
} else {
body_tokens.push("c");
}
}
if title_tokens.is_empty() && body_tokens.is_empty() {
body_tokens.push("z");
}
writer
.add_document(doc!(
f_title=>title_tokens.join(" "),
f_body=>body_tokens.join(" ")
))
.unwrap();
}
writer.commit().unwrap();
}
// Prepare reader/searcher once.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let searcher = reader.searcher();
// Build two query parsers with different default fields.
let qp_single = QueryParser::for_index(&index, vec![f_body]);
let qp_multi = QueryParser::for_index(&index, vec![f_title, f_body]);
let single_view = BenchIndex {
index: index.clone(),
searcher: searcher.clone(),
query_parser: qp_single,
};
let multi_view = BenchIndex {
index,
searcher,
query_parser: qp_multi,
};
(single_view, multi_view)
}
fn main() {
// Prepare corpora with varying selectivity. Build one index per corpus
// and derive two views (single-field vs multi-field) from it.
let scenarios = vec![
(
"N=1M, p(a)=5%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.05,
0.01,
0.15,
),
(
"N=1M, p(a)=1%, p(b)=1%, p(c)=15%".to_string(),
1_000_000,
0.01,
0.01,
0.15,
),
];
let mut runner = BenchRunner::new();
for (label, n, pa, pb, pc) in scenarios {
let (single_view, multi_view) = build_shared_indices(n, pa, pb, pc);
// Single-field group: default field is body only
{
let mut group = runner.new_group();
group.set_name(format!("single_field — {}", label));
group.register_with_input("+a_+b_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b"))
});
group.register_with_input("+a_+b_+c_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b +c"))
});
group.register_with_input("+a_+b_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b", 10))
});
group.register_with_input("+a_+b_+c_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b +c", 10))
});
// OR queries
group.register_with_input("a_OR_b_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b"))
});
group.register_with_input("a_OR_b_OR_c_count", &single_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b OR c"))
});
group.register_with_input("a_OR_b_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b", 10))
});
group.register_with_input("a_OR_b_OR_c_top10", &single_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b OR c", 10))
});
group.run();
}
// Multi-field group: default fields are [title, body]
{
let mut group = runner.new_group();
group.set_name(format!("multi_field — {}", label));
group.register_with_input("+a_+b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b"))
});
group.register_with_input("+a_+b_+c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("+a +b +c"))
});
group.register_with_input("+a_+b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b", 10))
});
group.register_with_input("+a_+b_+c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("+a +b +c", 10))
});
// OR queries
group.register_with_input("a_OR_b_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b"))
});
group.register_with_input("a_OR_b_OR_c_count", &multi_view, |benv: &BenchIndex| {
black_box(benv.count_query("a OR b OR c"))
});
group.register_with_input("a_OR_b_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b", 10))
});
group.register_with_input("a_OR_b_OR_c_top10", &multi_view, |benv: &BenchIndex| {
black_box(benv.topk_len("a OR b OR c", 10))
});
group.run();
}
}
}

View File

@@ -1,69 +0,0 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
fn main() {
let doc_count: usize = 500_000;
let subfield_counts: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 16, 256, 4096, 65536, 262144];
let indices: Vec<(String, Index)> = subfield_counts
.iter()
.map(|&sub_fields| {
(
format!("subfields={sub_fields}"),
build_index_with_json_subfields(doc_count, sub_fields),
)
})
.collect();
let mut group = InputGroup::new_with_inputs(indices);
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
group.config().num_iter_group = Some(1);
group.config().num_iter_bench = Some(1);
group.register("exists_json", exists_json_union);
group.run();
}
fn exists_json_union(index: &Index) {
let reader = index.reader().expect("reader");
let searcher = reader.searcher();
let query = ExistsQuery::new("json".to_string(), true);
let count = searcher.search(&query, &Count).expect("exists search");
// Prevents optimizer from eliding the search
black_box(count);
}
fn build_index_with_json_subfields(num_docs: usize, num_subfields: usize) -> Index {
// Schema: single JSON field stored as FAST to support ExistsQuery.
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).expect("create index");
{
let mut index_writer = index
.writer_with_num_threads(1, 200_000_000)
.expect("writer");
for i in 0..num_docs {
let sub = i % num_subfields;
// Only one subpath set per document; rotate subpaths so that
// no single subpath is full, but the union covers all docs.
let v = json!({ format!("field_{sub}"): i as u64 });
index_writer
.add_document(doc!(json_field => v))
.expect("add_document");
}
index_writer.commit().expect("commit");
}
index
}

View File

@@ -48,7 +48,7 @@ impl BitPacker {
pub fn flush<TWrite: io::Write + ?Sized>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = self.mini_buffer_written.div_ceil(8);
let num_bytes = (self.mini_buffer_written + 7) / 8;
let bytes = self.mini_buffer.to_le_bytes();
output.write_all(&bytes[..num_bytes])?;
self.mini_buffer_written = 0;
@@ -138,7 +138,7 @@ impl BitUnpacker {
// We use `usize` here to avoid overflow issues.
let end_bit_read = (end_idx as usize) * self.num_bits;
let end_byte_read = end_bit_read.div_ceil(8);
let end_byte_read = (end_bit_read + 7) / 8;
assert!(
end_byte_read <= data.len(),
"Requested index is out of bounds."

View File

@@ -140,10 +140,10 @@ impl BlockedBitpacker {
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
// todo performance: we could decompress a whole block and cache it instead
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
(0..bitpacked_elems)
let iter = (0..bitpacked_elems)
.map(move |idx| self.get(idx))
.chain(self.buffer.iter().cloned())
.chain(self.buffer.iter().cloned());
iter
}
}

View File

@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
ColumnIndex::Full => Box::new(doc_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_non_null_docs()
.iter_docs()
.map(move |row| row + doc_range.start),
),
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
multivalued_index
.optional_index
.iter_non_null_docs()
.iter_docs()
.map(move |row| row + doc_range.start),
),
},
@@ -105,11 +105,10 @@ fn get_num_values_iterator<'a>(
) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index {
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
ColumnIndex::Full => Box::new(std::iter::repeat_n(1u32, num_docs as usize)),
ColumnIndex::Optional(optional_index) => Box::new(std::iter::repeat_n(
1u32,
optional_index.num_non_nulls() as usize,
)),
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
ColumnIndex::Optional(optional_index) => {
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
}
ColumnIndex::Multivalued(multivalued_index) => Box::new(
multivalued_index
.get_start_index_column()
@@ -178,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
ColumnIndex::Full => Box::new(columnar_row_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_non_null_docs()
.iter_docs()
.map(move |row_id: RowId| columnar_row_range.start + row_id),
),
ColumnIndex::Multivalued(_) => {

View File

@@ -215,32 +215,6 @@ impl MultiValueIndex {
}
}
/// Returns an iterator over document ids that have at least one value.
pub fn iter_non_null_docs(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => {
let mut doc: DocId = 0u32;
let num_docs = idx.num_docs();
Box::new(std::iter::from_fn(move || {
// This is not the most efficient way to do this, but it's legacy code.
while doc < num_docs {
let cur = doc;
doc += 1;
let start = idx.start_index_column.get_val(cur);
let end = idx.start_index_column.get_val(cur + 1);
if end > start {
return Some(cur);
}
}
None
}))
}
MultiValueIndex::MultiValueIndexV2(idx) => {
Box::new(idx.optional_index.iter_non_null_docs())
}
}
}
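A hedged, self-contained model of the V1 start-index walk above (the offsets are illustrative):
```rust
// With start offsets [0, 2, 2, 5] for 3 docs, doc 0 spans values [0, 2),
// doc 1 is empty ([2, 2)), and doc 2 spans [2, 5), so the iterator
// yields [0, 2].
fn non_null_docs_naive(start_offsets: &[u32]) -> Vec<u32> {
    start_offsets
        .windows(2)
        .enumerate()
        .filter(|(_, bounds)| bounds[1] > bounds[0])
        .map(|(doc, _)| doc as u32)
        .collect()
}
```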
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// docids. Positions are converted inplace to docids.
///

View File

@@ -1,4 +1,4 @@
use std::io;
use std::io::{self, Write};
use std::sync::Arc;
mod set;
@@ -11,7 +11,7 @@ use set_block::{
};
use crate::iterable::Iterable;
use crate::{DocId, RowId};
use crate::{DocId, InvalidData, RowId};
/// The threshold for the number of elements after which we switch to dense block encoding.
///
@@ -88,7 +88,7 @@ pub struct OptionalIndex {
impl Iterable<u32> for &OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_non_null_docs())
Box::new(self.iter_docs())
}
}
@@ -280,9 +280,8 @@ impl OptionalIndex {
self.num_non_null_docs
}
pub fn iter_non_null_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize. We could iterate over the blocks directly.
// We use the dense value ids and retrieve the doc ids via select.
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
// TODO optimize
let mut select_batch = self.select_cursor();
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
}
@@ -335,6 +334,38 @@ enum Block<'a> {
Sparse(SparseBlock<'a>),
}
#[derive(Debug, Copy, Clone)]
enum OptionalIndexCodec {
Dense = 0,
Sparse = 1,
}
impl OptionalIndexCodec {
fn to_code(self) -> u8 {
self as u8
}
fn try_from_code(code: u8) -> Result<Self, InvalidData> {
match code {
0 => Ok(Self::Dense),
1 => Ok(Self::Sparse),
_ => Err(InvalidData),
}
}
}
impl BinarySerializable for OptionalIndexCodec {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&[self.to_code()])
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let optional_codec_code = u8::deserialize(reader)?;
let optional_codec = Self::try_from_code(optional_codec_code)?;
Ok(optional_codec)
}
}
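A hedged in-crate usage sketch of the round-trip (assuming `InvalidData` converts into `io::Error`, as the `?` in `deserialize` suggests):
```rust
#[test]
fn optional_index_codec_roundtrip_sketch() {
    // Serialize the codec tag into a buffer, then read it back.
    let mut buf: Vec<u8> = Vec::new();
    OptionalIndexCodec::Dense.serialize(&mut buf).unwrap();
    assert_eq!(buf, vec![0u8]);
    let codec = OptionalIndexCodec::deserialize(&mut &buf[..]).unwrap();
    assert!(matches!(codec, OptionalIndexCodec::Dense));
}
```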
fn serialize_optional_index_block(block_els: &[u16], out: &mut impl io::Write) -> io::Result<()> {
let is_sparse = is_sparse(block_els.len() as u32);
if is_sparse {

View File

@@ -164,11 +164,7 @@ fn test_optional_index_large() {
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
assert_eq!(optional_index.num_docs(), num_rows);
assert!(
optional_index
.iter_non_null_docs()
.eq(row_ids.iter().copied())
);
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
}
#[test]

View File

@@ -185,10 +185,10 @@ impl CompactSpaceBuilder {
let mut covered_space = Vec::with_capacity(self.blanks.len());
// beginning of the blanks
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start)
&& *first_blank_start != 0
{
covered_space.push(0..=first_blank_start - 1);
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
if *first_blank_start != 0 {
covered_space.push(0..=first_blank_start - 1);
}
}
// Between the blanks
@@ -202,10 +202,10 @@ impl CompactSpaceBuilder {
covered_space.extend(between_blanks);
// end of the blanks
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end)
&& *last_blank_end != u128::MAX
{
covered_space.push(last_blank_end + 1..=u128::MAX);
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
if *last_blank_end != u128::MAX {
covered_space.push(last_blank_end + 1..=u128::MAX);
}
}
if covered_space.is_empty() {

View File

@@ -105,7 +105,7 @@ impl ColumnCodecEstimator for BitpackedCodecEstimator {
fn estimate(&self, stats: &ColumnStats) -> Option<u64> {
let num_bits_per_value = num_bits(stats);
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64)).div_ceil(8))
Some(stats.num_bytes() + (stats.num_rows as u64 * (num_bits_per_value as u64) + 7) / 8)
}
fn serialize(

View File

@@ -117,7 +117,7 @@ impl ColumnCodecEstimator for LinearCodecEstimator {
Some(
stats.num_bytes()
+ linear_params.num_bytes()
+ (num_bits as u64 * stats.num_rows as u64).div_ceil(8),
+ (num_bits as u64 * stats.num_rows as u64 + 7) / 8,
)
}

View File

@@ -367,7 +367,7 @@ fn is_empty_after_merge(
ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() {
for doc in optional_index.iter_docs() {
if alive_bitset.contains(doc) {
return false;
}

View File

@@ -244,7 +244,7 @@ impl SymbolValue for UnorderedId {
fn compute_num_bytes_for_u64(val: u64) -> usize {
let msb = (64u32 - val.leading_zeros()) as usize;
msb.div_ceil(8)
(msb + 7) / 8
}
fn encode_zig_zag(n: i64) -> u64 {

View File

@@ -1,5 +1,3 @@
use std::str::FromStr;
use common::DateTime;
use crate::InvalidData;
@@ -11,23 +9,6 @@ pub enum NumericalValue {
F64(f64),
}
impl FromStr for NumericalValue {
type Err = ();
fn from_str(s: &str) -> Result<Self, ()> {
if let Ok(val_i64) = s.parse::<i64>() {
return Ok(val_i64.into());
}
if let Ok(val_u64) = s.parse::<u64>() {
return Ok(val_u64.into());
}
if let Ok(val_f64) = s.parse::<f64>() {
return Ok(NumericalValue::from(val_f64).normalize());
}
Err(())
}
}
impl NumericalValue {
pub fn numerical_type(&self) -> NumericalType {
match self {
@@ -45,7 +26,7 @@ impl NumericalValue {
if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64)
} else {
NumericalValue::U64(val)
NumericalValue::F64(val as f64)
}
}
NumericalValue::I64(val) => NumericalValue::I64(val),
@@ -160,7 +141,6 @@ impl Coerce for DateTime {
#[cfg(test)]
mod tests {
use super::NumericalType;
use crate::NumericalValue;
#[test]
fn test_numerical_type_code() {
@@ -173,58 +153,4 @@ mod tests {
}
assert_eq!(num_numerical_type, 3);
}
#[test]
fn test_parse_numerical() {
assert_eq!(
"123".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(123)
);
assert_eq!(
"18446744073709551615".parse::<NumericalValue>().unwrap(),
NumericalValue::U64(18446744073709551615u64)
);
assert_eq!(
"1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(1i64)
);
assert_eq!(
"1.1".parse::<NumericalValue>().unwrap(),
NumericalValue::F64(1.1f64)
);
assert_eq!(
"-1.0".parse::<NumericalValue>().unwrap(),
NumericalValue::I64(-1i64)
);
}
#[test]
fn test_normalize_numerical() {
assert_eq!(
NumericalValue::from(1u64).normalize(),
NumericalValue::I64(1i64),
);
let limit_val = i64::MAX as u64 + 1u64;
assert_eq!(
NumericalValue::from(limit_val).normalize(),
NumericalValue::U64(limit_val),
);
assert_eq!(
NumericalValue::from(-1i64).normalize(),
NumericalValue::I64(-1i64),
);
assert_eq!(
NumericalValue::from(-2.0f64).normalize(),
NumericalValue::I64(-2i64),
);
assert_eq!(
NumericalValue::from(-2.1f64).normalize(),
NumericalValue::F64(-2.1f64),
);
let large_float = 2.0f64.powf(70.0f64);
assert_eq!(
NumericalValue::from(large_float).normalize(),
NumericalValue::F64(large_float),
);
}
}

View File

@@ -183,7 +183,7 @@ pub struct BitSet {
}
fn num_buckets(max_val: u32) -> u32 {
max_val.div_ceil(64u32)
(max_val + 63u32) / 64u32
}
impl BitSet {

View File

@@ -29,7 +29,6 @@ impl BinarySerializable for VIntU128 {
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes();
let mut result = 0u128;
let mut shift = 0u64;
@@ -53,7 +52,7 @@ impl BinarySerializable for VIntU128 {
}
}
/// Wrapper over a `u64` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VInt(pub u64);
@@ -197,7 +196,6 @@ impl BinarySerializable for VInt {
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
#[allow(clippy::unbuffered_bytes)]
let mut bytes = reader.bytes();
let mut result = 0u64;
let mut shift = 0u64;
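For context, a self-contained sketch of a variable-length integer codec in the spirit of `VInt` (hedged: this uses the common LEB128 convention where a set high bit means "more bytes follow"; tantivy's own stop-bit convention may differ):
```rust
use std::io::{self, Read, Write};

// Each byte carries 7 payload bits, least significant group first.
fn write_vint<W: Write>(mut val: u64, w: &mut W) -> io::Result<()> {
    loop {
        let byte = (val & 0x7f) as u8;
        val >>= 7;
        if val == 0 {
            return w.write_all(&[byte]);
        }
        w.write_all(&[byte | 0x80])?;
    }
}

fn read_vint<R: Read>(r: &mut R) -> io::Result<u64> {
    let mut result = 0u64;
    let mut shift = 0u32;
    for byte in r.bytes() {
        let b = byte?;
        result |= u64::from(b & 0x7f) << shift;
        if b & 0x80 == 0 {
            return Ok(result);
        }
        shift += 7;
    }
    Err(io::Error::new(io::ErrorKind::UnexpectedEof, "truncated vint"))
}
```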

View File

@@ -15,5 +15,3 @@ edition = "2024"
nom = "7"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
ordered-float = "5.0.0"
fnv = "1.0.7"

View File

@@ -117,22 +117,6 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
}
}
pub(crate) fn terminated_infallible<I, O1, O2, F, G>(
mut first: F,
mut second: G,
) -> impl FnMut(I) -> JResult<I, O1>
where
F: nom::Parser<I, (O1, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
{
move |input: I| {
let (input, (o1, mut err)) = first.parse(input)?;
let (input, (_, mut err2)) = second.parse(input)?;
err.append(&mut err2);
Ok((input, (o1, err)))
}
}
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
mut first: F,
mut second: G,

View File

@@ -31,17 +31,7 @@ pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
#[cfg(test)]
mod tests {
use crate::{UserInputAst, parse_query, parse_query_lenient};
#[test]
fn test_deduplication() {
let ast: UserInputAst = parse_query("a a").unwrap();
let json = serde_json::to_string(&ast).unwrap();
assert_eq!(
json,
r#"{"type":"bool","clauses":[[null,{"type":"literal","field_name":null,"phrase":"a","delimiter":"none","slop":0,"prefix":false}]]}"#
);
}
use crate::{parse_query, parse_query_lenient};
#[test]
fn test_parse_query_serialization() {

View File

@@ -1,7 +1,6 @@
use std::borrow::Cow;
use std::iter::once;
use fnv::FnvHashSet;
use nom::IResult;
use nom::branch::alt;
use nom::bytes::complete::tag;
@@ -69,7 +68,7 @@ fn interpret_escape(source: &str) -> String {
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, Cow<'_, str>> {
fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res(
recognize(tuple((
alt((
@@ -306,14 +305,15 @@ fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _, _)) =
tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed");
delimited_infallible(
let res = delimited_infallible(
nothing,
map(ast_infallible, |(mut ast, errors)| {
ast.set_default_field(field_name.to_string());
(ast, errors)
}),
opt_i_err(char(')'), "expected ')'"),
)(inp)
)(inp);
res
}
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
@@ -367,10 +367,7 @@ fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// something (a field name) got parsed before
alt((
map(
tuple((
opt(field_name),
alt((range, set, exists, regex, term_or_phrase)),
)),
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
),
term_group,
@@ -392,10 +389,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
value((), peek(one_of("{[><"))),
map(range_infallible, |(range, errs)| (Some(range), errs)),
),
(
value((), peek(one_of("/"))),
map(regex_infallible, |(regex, errs)| (Some(regex), errs)),
),
),
delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing),
),
@@ -696,61 +689,6 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
}
}
fn regex(inp: &str) -> IResult<&str, UserInputLeaf> {
map(
terminated(
delimited(
char('/'),
many1(alt((preceded(char('\\'), char('/')), none_of("/")))),
char('/'),
),
peek(alt((multispace1, eof))),
),
|elements| UserInputLeaf::Regex {
field: None,
pattern: elements.into_iter().collect::<String>(),
},
)(inp)
}
fn regex_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
match terminated_infallible(
delimited_infallible(
opt_i_err(char('/'), "missing delimiter /"),
opt_i(many1(alt((preceded(char('\\'), char('/')), none_of("/"))))),
opt_i_err(char('/'), "missing delimiter /"),
),
opt_i_err(
peek(alt((multispace1, eof))),
"expected whitespace or end of input",
),
)(inp)
{
Ok((rest, (elements_part, errors))) => {
let pattern = match elements_part {
Some(elements_part) => elements_part.into_iter().collect(),
None => String::new(),
};
let res = UserInputLeaf::Regex {
field: None,
pattern,
};
Ok((rest, (res, errors)))
}
Err(e) => {
let errs = vec![LenientErrorInternal {
pos: inp.len(),
message: e.to_string(),
}];
let res = UserInputLeaf::Regex {
field: None,
pattern: String::new(),
};
Ok((inp, (res, errs)))
}
}
}
fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot)
}
@@ -815,7 +753,7 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
tuple((leaf, fallible(boost))),
|(leaf, boost_opt)| match boost_opt {
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => {
UserInputAst::Boost(Box::new(leaf), boost.into())
UserInputAst::Boost(Box::new(leaf), boost)
}
_ => leaf,
},
@@ -827,7 +765,7 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
tuple_infallible((leaf_infallible, boost)),
|((leaf, boost_opt), error)| match boost_opt {
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => (
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost.into())),
leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost)),
error,
),
_ => (leaf, error),
@@ -1078,25 +1016,12 @@ pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>
(rewrite_ast(res), errors)
}
/// Removes unnecessary children clauses in AST
///
/// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {
if let UserInputAst::Clause(sub_clauses) = &mut input {
// call rewrite_ast recursively on children clauses if applicable
let mut new_clauses = Vec::with_capacity(sub_clauses.len());
for (occur, clause) in sub_clauses.drain(..) {
let rewritten_clause = rewrite_ast(clause);
new_clauses.push((occur, rewritten_clause));
}
*sub_clauses = new_clauses;
// remove duplicate child clauses
// e.g. (+a +b) OR (+c +d) OR (+a +b) => (+a +b) OR (+c +d)
let mut seen = FnvHashSet::default();
sub_clauses.retain(|term| seen.insert(term.clone()));
// Removes unnecessary children clauses in AST
//
// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
for term in sub_clauses {
if let UserInputAst::Clause(terms) = &mut input {
for term in terms {
rewrite_ast_clause(term);
}
}
@@ -1769,63 +1694,6 @@ mod test {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
#[test]
fn test_regex_parser() {
let r = parse_to_ast(r#"a:/joh?n(ath[oa]n)/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "joh?n(ath[oa]n)");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
let r = parse_to_ast(r#"a:/\\/cgi-bin\\/luci.*/"#);
assert!(r.is_ok(), "Failed to parse custom query: {r:?}");
let (_, input) = r.unwrap();
match input {
UserInputAst::Leaf(leaf) => match leaf.as_ref() {
UserInputLeaf::Regex { field, pattern } => {
assert_eq!(field, &Some("a".to_string()));
assert_eq!(pattern, "\\/cgi-bin\\/luci.*");
}
_ => panic!("Expected a regex leaf, got {leaf:?}"),
},
_ => panic!("Expected a leaf"),
}
}
#[test]
fn test_regex_parser_lenient() {
let literal = |query| literal_infallible(query).unwrap().1;
let (res, errs) = literal(r#"a:/joh?n(ath[oa]n)/"#);
let expected = UserInputLeaf::Regex {
field: Some("a".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert!(errs.is_empty(), "Expected no errors, got: {errs:?}");
let (res, errs) = literal("title:/joh?n(ath[oa]n)");
let expected = UserInputLeaf::Regex {
field: Some("title".to_string()),
pattern: "joh?n(ath[oa]n)".to_string(),
}
.into();
assert_eq!(res.unwrap(), expected);
assert_eq!(errs.len(), 1, "Expected 1 error, got: {errs:?}");
assert_eq!(
errs[0].message, "missing delimiter /",
"Unexpected error message",
);
}
#[test]
fn test_space_before_value() {
test_parse_query_to_ast_helper("field : a", r#""field":a"#);

View File

@@ -5,7 +5,7 @@ use serde::Serialize;
use crate::Occur;
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[derive(PartialEq, Clone, Serialize)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum UserInputLeaf {
@@ -23,10 +23,6 @@ pub enum UserInputLeaf {
Exists {
field: String,
},
Regex {
field: Option<String>,
pattern: String,
},
}
impl UserInputLeaf {
@@ -50,7 +46,6 @@ impl UserInputLeaf {
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
UserInputLeaf::Regex { field: _, pattern } => UserInputLeaf::Regex { field, pattern },
}
}
@@ -108,19 +103,11 @@ impl Debug for UserInputLeaf {
UserInputLeaf::Exists { field } => {
write!(formatter, "$exists(\"{field}\")")
}
UserInputLeaf::Regex { field, pattern } => {
if let Some(field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
}
// TODO properly escape pattern (in case of \")
write!(formatter, "/{pattern}/")
}
}
}
}
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize)]
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum Delimiter {
SingleQuotes,
@@ -128,7 +115,7 @@ pub enum Delimiter {
None,
}
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[derive(PartialEq, Clone, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct UserInputLiteral {
pub field_name: Option<String>,
@@ -167,7 +154,7 @@ impl fmt::Debug for UserInputLiteral {
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone, Serialize)]
#[derive(PartialEq, Debug, Clone, Serialize)]
#[serde(tag = "type", content = "value")]
#[serde(rename_all = "snake_case")]
pub enum UserInputBound {
@@ -204,11 +191,11 @@ impl UserInputBound {
}
}
#[derive(PartialEq, Eq, Hash, Clone, Serialize)]
#[derive(PartialEq, Clone, Serialize)]
#[serde(into = "UserInputAstSerde")]
pub enum UserInputAst {
Clause(Vec<(Option<Occur>, UserInputAst)>),
Boost(Box<UserInputAst>, ordered_float::OrderedFloat<f64>),
Boost(Box<UserInputAst>, f64),
Leaf(Box<UserInputLeaf>),
}
@@ -230,10 +217,9 @@ impl From<UserInputAst> for UserInputAstSerde {
fn from(ast: UserInputAst) -> Self {
match ast {
UserInputAst::Clause(clause) => UserInputAstSerde::Bool { clauses: clause },
UserInputAst::Boost(underlying, boost) => UserInputAstSerde::Boost {
underlying,
boost: boost.into_inner(),
},
UserInputAst::Boost(underlying, boost) => {
UserInputAstSerde::Boost { underlying, boost }
}
UserInputAst::Leaf(leaf) => UserInputAstSerde::Leaf(leaf),
}
}
@@ -392,7 +378,7 @@ mod tests {
#[test]
fn test_boost_serialization() {
let inner_ast = UserInputAst::Leaf(Box::new(UserInputLeaf::All));
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5.into());
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5);
let json = serde_json::to_string(&boost_ast).unwrap();
assert_eq!(
json,
@@ -419,7 +405,7 @@ mod tests {
}))),
),
])),
2.5.into(),
2.5,
);
let json = serde_json::to_string(&boost_ast).unwrap();
assert_eq!(

View File

@@ -301,7 +301,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
let bounds = self.bounds;
let interval = self.interval;
let offset = self.offset;
let get_bucket_pos = |val| get_bucket_pos_f64(val, interval, offset) as i64;
let get_bucket_pos = |val| (get_bucket_pos_f64(val, interval, offset) as i64);
bucket_agg_accessor
.column_block_accessor

View File

@@ -484,6 +484,7 @@ impl FacetCounts {
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;
use std::iter;
use columnar::Dictionary;
use rand::distributions::Uniform;

View File

@@ -1,4 +1,3 @@
use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -153,7 +152,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value::<u64>(val);
term_buffer.append_type_and_fast_value(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
@@ -167,30 +166,12 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::F64(val) => {
if !val.is_finite() {
return;
};
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
// Normalize here is important.
// In the inverted index, we coerce all numerical values to their canonical
// representation.
//
// (We do the same thing on the query side)
match NumericalValue::F64(val).normalize() {
NumericalValue::I64(val_i64) => {
term_buffer.append_type_and_fast_value::<i64>(val_i64);
}
NumericalValue::U64(val_u64) => {
term_buffer.append_type_and_fast_value::<u64>(val_u64);
}
NumericalValue::F64(val_f64) => {
term_buffer.append_type_and_fast_value::<f64>(val_f64);
}
}
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::Bool(val) => {
@@ -260,8 +241,8 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(
term: &Term,
text: &str,
mut term: Term,
phrase: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
assert_eq!(
@@ -273,50 +254,31 @@ pub fn convert_to_fast_value_and_append_to_json_term(
0,
"JSON value bytes should be empty"
);
try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
.or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
.or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
}
fn try_convert_to_datetime_and_append_to_json_term(
term: &Term,
text: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
}
term.append_type_and_fast_value(dt);
return Some(term);
}
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(dt);
Some(term_clone)
}
fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
let mut term_clone = term.clone();
// Parse is actually returning normalized values already today, but let's
// not rely on that hidden contract.
match numerical_value.normalize() {
NumericalValue::I64(i64_value) => {
term_clone.append_type_and_fast_value::<i64>(i64_value);
}
NumericalValue::U64(u64_value) => {
term_clone.append_type_and_fast_value::<u64>(u64_value);
}
NumericalValue::F64(f64_value) => {
term_clone.append_type_and_fast_value::<f64>(f64_value);
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
term.append_type_and_fast_value(i64_val);
return Some(term);
}
Some(term_clone)
}
fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
let val = str::parse::<bool>(text).ok()?;
let mut term_clone = term.clone();
term_clone.append_type_and_fast_value(val);
Some(term_clone)
if let Ok(u64_val) = str::parse::<u64>(phrase) {
term.append_type_and_fast_value(u64_val);
return Some(term);
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
term.append_type_and_fast_value(f64_val);
return Some(term);
}
if let Ok(bool_val) = str::parse::<bool>(phrase) {
term.append_type_and_fast_value(bool_val);
return Some(term);
}
None
}
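A hedged usage sketch of the conversion order above (shown as it might appear in an in-crate test; the field name and path are illustrative, and the import path is an assumption):
```rust
use crate::json_utils::convert_to_fast_value_and_append_to_json_term;
use crate::schema::{Schema, Term, TEXT};

#[test]
fn json_fast_value_conversion_sketch() {
    let mut schema_builder = Schema::builder();
    let json_field = schema_builder.add_json_field("attrs", TEXT);
    let _schema = schema_builder.build();

    // "42" parses as i64, so an i64 fast value is appended to the term.
    let term = Term::from_field_json_path(json_field, "count", false);
    assert!(convert_to_fast_value_and_append_to_json_term(term, "42", false).is_some());

    // "not-a-number" matches none of datetime/i64/u64/f64/bool.
    let term = Term::from_field_json_path(json_field, "count", false);
    assert!(convert_to_fast_value_and_append_to_json_term(term, "not-a-number", false).is_none());
}
```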
/// Splits a json path supplied to the query parser in such a way that

View File

@@ -484,8 +484,10 @@ impl Directory for MmapDirectory {
.map_err(LockError::wrap_io_error)?;
if lock.is_blocking {
file.lock_exclusive().map_err(LockError::wrap_io_error)?;
} else if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
return Err(LockError::LockBusy);
} else {
if !file.try_lock_exclusive().map_err(|_| LockError::LockBusy)? {
return Err(LockError::LockBusy);
}
}
// dropping the file handle will release the lock.
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {

View File

@@ -146,7 +146,7 @@ impl InvertedIndexReader {
positions_size: ByteCount::default(),
num_terms: 0u64,
};
field_space.record(term_info);
field_space.record(&term_info);
// We include the json type and the json end of path to make sure the prefix check
// is meaningful.

View File

@@ -615,7 +615,7 @@ impl<D: Document> IndexWriter<D> {
/// It is also possible to add a payload to the `commit`
/// using this API.
/// See [`PreparedCommit::set_payload()`].
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<'_, D>> {
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit<D>> {
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.

View File

@@ -55,7 +55,7 @@
//! // between indexing threads.
//! let mut index_writer: IndexWriter = index.writer(100_000_000)?;
//!
//! // Let's index a document!
//! // Let's index one document!
//! index_writer.add_document(doc!(
//! title => "The Old Man and the Sea",
//! body => "He was an old man who fished alone in a skiff in \
@@ -370,8 +370,6 @@ macro_rules! fail_point {
/// Common test utilities.
#[cfg(test)]
pub mod tests {
use std::collections::BTreeMap;
use common::{BinarySerializable, FixedSize};
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
use rand::distributions::{Bernoulli, Uniform};
@@ -384,7 +382,7 @@ pub mod tests {
use crate::index::SegmentReader;
use crate::merge_policy::NoMergePolicy;
use crate::postings::Postings;
use crate::query::{BooleanQuery, QueryParser};
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
@@ -1225,49 +1223,4 @@ pub mod tests {
);
assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
}
#[test]
fn test_json_number_ambiguity() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("number", crate::schema::TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::I64(1i64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::U64(1u64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
{
let mut doc = TantivyDocument::new();
let mut obj = BTreeMap::default();
obj.insert("key".to_string(), OwnedValue::F64(1.0f64));
doc.add_object(json_field, obj);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.num_docs(), 3);
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
{
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query("number.key:1.0").unwrap();
let count = searcher.search(&query, &crate::collector::Count).unwrap();
assert_eq!(count, 3);
}
}
}

View File

@@ -227,6 +227,19 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array()
}
/// Returns a full block, regardless of whether the block is complete or incomplete (
/// as it happens for the last block of the posting list).
///
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
@@ -262,36 +275,22 @@ impl BlockSegmentPostings {
///
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
pub fn seek(&mut self, target_doc: DocId) -> usize {
// Move to the block that might contain our document.
self.seek_block(target_doc);
pub fn seek(&mut self, target_doc: DocId) {
self.shallow_seek(target_doc);
self.load_block();
// At this point we are on the block that might contain our document.
let doc = self.doc_decoder.seek_within_block(target_doc);
// The last block is not full and is padded with TERMINATED,
// so we are guaranteed to have at least one value (real or padding)
// that is >= target_doc.
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target_doc`.
// If all docs are smaller than target, the current block is incomplete and padded
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
doc
}
pub(crate) fn position_offset(&self) -> u64 {
self.skip_reader.position_offset()
}
/// Dangerous API! This seeks the next block on the skip list,
/// Dangerous API! This calls seek on the skip list,
/// but does not `.load_block()` afterwards.
///
/// `.load_block()` needs to be called manually afterwards.
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
self.block_loaded = false;

View File

@@ -151,11 +151,9 @@ impl BlockDecoder {
&self.output[..self.output_len]
}
/// Return in-block index of first value >= `target`.
/// Uses the padded buffer to enable branchless search.
#[inline]
pub(crate) fn seek_within_block(&self, target: u32) -> usize {
crate::postings::branchless_binary_search(&self.output, target)
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
&self.output
}
#[inline]

View File

@@ -4,7 +4,7 @@ use crate::docset::DocSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{BlockSegmentPostings, Postings};
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings};
use crate::{DocId, TERMINATED};
/// `SegmentPostings` represents the inverted list or postings associated with
@@ -175,11 +175,26 @@ impl DocSet for SegmentPostings {
return self.doc();
}
// Delegate block-local search to BlockSegmentPostings::seek, which returns
// the in-block index of the first doc >= target.
self.cur = self.block_cursor.seek(target);
let doc = self.doc();
self.block_cursor.seek(target);
// At this point we are on the block that might contain our document.
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);
// The last block is not full and is padded with the value TERMINATED,
// so we are guaranteed to have at least one doc in the block (a real one
// or the padding) that is greater than or equal to the target.
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target`.
// If all docs are smaller than target, the current block should be incomplete
// and padded with the value `TERMINATED`.
//
// After the search, the cursor should point to the first TERMINATED value.
let doc = output[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
}
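A hedged, illustrative reimplementation of the search step (assumed semantics of `branchless_binary_search`: return the index of the first element >= `target` in a sorted block padded with `TERMINATED`; the fixed iteration count and arithmetic on the comparison result avoid data-dependent branches):
```rust
const COMPRESSION_BLOCK_SIZE: usize = 128;

fn branchless_binary_search_sketch(block: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
    let mut start = 0usize;
    let mut len = block.len();
    while len > 1 {
        let half = len / 2;
        // Branch-free step: skip the left half iff its last element is
        // still below the target.
        start += usize::from(block[start + half - 1] < target) * half;
        len -= half;
    }
    start
}
```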

View File

@@ -75,7 +75,7 @@ impl InvertedIndexSerializer {
field: Field,
total_num_tokens: u64,
fieldnorm_reader: Option<FieldNormReader>,
) -> io::Result<FieldSerializer<'_>> {
) -> io::Result<FieldSerializer> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -126,7 +126,7 @@ impl<'a> FieldSerializer<'a> {
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
let average_fieldnorm = fieldnorm_reader
.as_ref()
.map(|ff_reader| total_num_tokens as Score / ff_reader.num_docs() as Score)
.map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
.unwrap_or(0.0);
let postings_serializer = PostingsSerializer::new(
postings_write,

View File

@@ -1,3 +1,5 @@
use serde::{Deserialize, Serialize};
use crate::fieldnorm::FieldNormReader;
use crate::query::Explanation;
use crate::schema::Field;
@@ -66,6 +68,12 @@ fn compute_tf_cache(average_fieldnorm: Score) -> [Score; 256] {
cache
}
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct Bm25Params {
pub idf: Score,
pub avg_fieldnorm: Score,
}
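A hedged sketch of how these two parameters typically enter the classic BM25 formula (`k1` and `b` are the usual free parameters with common default values; this mirrors the textbook definition and assumes `Score` is `f32`, not necessarily tantivy's exact implementation):
```rust
// score(term, doc) = idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * fieldnorm / avg_fieldnorm))
fn bm25_term_score(params: &Bm25Params, term_freq: f32, fieldnorm: f32) -> f32 {
    let (k1, b) = (1.2f32, 0.75f32);
    let norm = k1 * (1.0 - b + b * fieldnorm / params.avg_fieldnorm);
    params.idf * term_freq * (k1 + 1.0) / (term_freq + norm)
}
```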
/// A struct used for computing BM25 scores.
#[derive(Clone)]
pub struct Bm25Weight {

View File

@@ -167,7 +167,7 @@ pub fn block_wand(
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.seek_block(pivot_doc);
scorer.shallow_seek(pivot_doc);
scorer.block_max_score()
})
.sum();
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
return;
}
doc = last_doc_in_block + 1;
scorer.seek_block(doc);
scorer.shallow_seek(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
}
}
doc += 1;
scorer.seek_block(doc);
scorer.shallow_seek(doc);
}
}
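For context, a minimal sketch of the block-max pruning decision these loops rely on (hedged: a model of the idea, not tantivy's exact control flow). After shallow-seeking each scorer up to the pivot, the sum of per-block max scores upper-bounds every score in the block, so a block whose bound cannot beat the current threshold can be skipped without being decoded:
```rust
// If the optimistic upper bound cannot exceed the heap threshold, no doc
// in this block can enter the top-k, so the whole block may be skipped.
fn block_can_be_skipped(block_max_scores: &[f32], threshold: f32) -> bool {
    block_max_scores.iter().sum::<f32>() <= threshold
}
```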
@@ -302,6 +302,7 @@ fn is_sorted<I: Iterator<Item = DocId>>(mut it: I) -> bool {
mod tests {
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::iter;
use proptest::prelude::*;

View File

@@ -1,15 +1,12 @@
use core::fmt::Debug;
use columnar::{ColumnIndex, DynamicColumn};
use common::BitSet;
use super::{ConstScorer, EmptyScorer};
use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::query::all_query::AllScorer;
use crate::query::boost_query::BoostScorer;
use crate::query::explanation::does_not_match;
use crate::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::Type;
use crate::{DocId, Score, TantivyError};
@@ -116,49 +113,13 @@ impl Weight for ExistsWeight {
non_empty_columns.push(column)
}
}
if non_empty_columns.is_empty() {
return Ok(Box::new(EmptyScorer));
}
// If any column is full, all docs match.
let max_doc = reader.max_doc();
if non_empty_columns
.iter()
.any(|col| matches!(col.column_index(), ColumnIndex::Full))
{
let all_scorer = AllScorer::new(max_doc);
return Ok(Box::new(BoostScorer::new(all_scorer, boost)));
}
// If we have a single dynamic column, use ExistsDocSet
// NOTE: A lower number may be better for very sparse columns
if non_empty_columns.len() < 4 {
// TODO: we can optimize more here since in most cases we will have only one index
if !non_empty_columns.is_empty() {
let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
return Ok(Box::new(ConstScorer::new(docset, boost)));
Ok(Box::new(ConstScorer::new(docset, boost)))
} else {
Ok(Box::new(EmptyScorer))
}
// If we have many dynamic columns, precompute a bitset of matching docs
let mut doc_bitset = BitSet::with_max_value(max_doc);
for column in &non_empty_columns {
match column.column_index() {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => {
// Handled by AllScorer return above.
}
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
ColumnIndex::Multivalued(multi_idx) => {
for doc in multi_idx.iter_non_null_docs() {
doc_bitset.insert(doc);
}
}
}
}
let docset = BitSetDocSet::from(doc_bitset);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -333,43 +294,6 @@ mod tests {
Ok(())
}
#[test]
fn test_exists_query_json_union_no_single_full_subpath() -> crate::Result<()> {
// Build docs where no single subpath exists for all docs, but the union does.
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..100u64 {
if i % 2 == 0 {
// only subpath `a`
index_writer.add_document(doc!(json => json!({"a": i})))?;
} else {
// only subpath `b`
index_writer.add_document(doc!(json => json!({"b": i})))?;
}
}
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// No single subpath is full
assert_eq!(count_existing_fields(&searcher, "json.a", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.b", false)?, 50);
// A root exists query with subpaths disabled matches zero docs
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
// Root exists with subpaths enabled should match all docs via union
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
Ok(())
}
#[test]
fn test_exists_query_misc_supported_types() -> crate::Result<()> {
let mut schema_builder = Schema::builder();

View File

@@ -104,7 +104,7 @@ mod tests {
let query = query_parser.parse_query("a a a a a").unwrap();
let mut terms = Vec::new();
query.query_terms(&mut |term, pos| terms.push((term, pos)));
assert_eq!(vec![(&term_a, false); 1], terms);
assert_eq!(vec![(&term_a, false); 5], terms);
}
{
let query = query_parser.parse_query("a -b").unwrap();

View File

@@ -1,11 +1,8 @@
use std::fmt;
use std::ops::Bound;
use std::sync::Arc;
use tantivy_fst::Regex;
use crate::query::Occur;
use crate::schema::{Field, Term};
use crate::schema::Term;
use crate::Score;
#[derive(Clone)]
@@ -24,10 +21,6 @@ pub enum LogicalLiteral {
elements: Vec<Term>,
},
All,
Regex {
pattern: Arc<Regex>,
field: Field,
},
}
pub enum LogicalAst {
@@ -45,7 +38,6 @@ impl LogicalAst {
}
}
// TODO: Move to rewrite_ast in query_grammar
pub fn simplify(self) -> LogicalAst {
match self {
LogicalAst::Clause(clauses) => {
@@ -155,10 +147,6 @@ impl fmt::Debug for LogicalLiteral {
write!(formatter, "]")
}
LogicalLiteral::All => write!(formatter, "*"),
LogicalLiteral::Regex {
ref pattern,
ref field,
} => write!(formatter, "Regex({field:?}, {pattern:?})"),
}
}
}

View File

@@ -2,14 +2,12 @@ use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};
use std::sync::Arc;
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use itertools::Itertools;
use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use rustc_hash::FxHashMap;
use tantivy_fst::Regex;
use super::logical_ast::*;
use crate::index::Index;
@@ -17,7 +15,7 @@ use crate::json_utils::convert_to_fast_value_and_append_to_json_term;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
PhraseQuery, Query, RegexQuery, TermQuery, TermSetQuery,
PhraseQuery, Query, TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -208,7 +206,6 @@ pub struct QueryParser {
tokenizer_manager: TokenizerManager,
boost: FxHashMap<Field, Score>,
fuzzy: FxHashMap<Field, Fuzzy>,
regexes_allowed: bool,
}
#[derive(Clone)]
@@ -263,7 +260,6 @@ impl QueryParser {
conjunction_by_default: false,
boost: Default::default(),
fuzzy: Default::default(),
regexes_allowed: false,
}
}
@@ -324,11 +320,6 @@ impl QueryParser {
);
}
/// Allow regexes in queries
pub fn allow_regexes(&mut self) {
self.regexes_allowed = true;
}
/// Parse a query
///
/// Note that `parse_query` returns an error if the input
@@ -495,17 +486,24 @@ impl QueryParser {
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(ref json_options) => {
let mut term = Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
);
let get_term_with_path = || {
Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
)
};
if let Some(term) =
// Try to convert the phrase to a fast value
convert_to_fast_value_and_append_to_json_term(&term, phrase, false)
convert_to_fast_value_and_append_to_json_term(
get_term_with_path(),
phrase,
false,
)
{
Ok(term)
} else {
let mut term = get_term_with_path();
term.append_type_and_str(phrase);
Ok(term)
}
@@ -672,7 +670,7 @@ impl QueryParser {
}
UserInputAst::Boost(ast, boost) => {
let (ast, errors) = self.compute_logical_ast_with_occur_lenient(*ast);
(ast.boost(boost.into_inner() as Score), errors)
(ast.boost(boost as Score), errors)
}
UserInputAst::Leaf(leaf) => {
let (ast, errors) = self.compute_logical_ast_from_leaf_lenient(*leaf);
@@ -862,51 +860,6 @@ impl QueryParser {
"Range query need to target a specific field.".to_string(),
)],
),
UserInputLeaf::Regex { field, pattern } => {
if !self.regexes_allowed {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex queries are not allowed.".to_string(),
)],
);
}
let full_path = try_tuple!(field.ok_or_else(|| {
QueryParserError::UnsupportedQuery(
"Regex query need to target a specific field.".to_string(),
)
}));
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
if !json_path.is_empty() {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query does not support json paths.".to_string(),
)],
);
}
if !matches!(
self.schema.get_field_entry(field).field_type(),
FieldType::Str(_)
) {
return (
None,
vec![QueryParserError::UnsupportedQuery(
"Regex query only supported on text fields".to_string(),
)],
);
}
let pattern = try_tuple!(Regex::new(&pattern).map_err(|e| {
QueryParserError::UnsupportedQuery(format!("Invalid regex: {e}"))
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Regex {
pattern: Arc::new(pattern),
field,
}));
(Some(logical_ast), Vec::new())
}
}
}
}
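
Independently of the `/pattern/` parser syntax handled above, a regex query can be issued programmatically; a hedged sketch using the public `RegexQuery::from_pattern` constructor (the `index` and `title` bindings are assumptions):

```rust
use tantivy::collector::Count;
use tantivy::query::RegexQuery;
use tantivy::schema::Field;
use tantivy::Index;

fn count_matches(index: &Index, title: Field) -> tantivy::Result<usize> {
    // Pattern validation happens here, as it does on the parser path.
    let query = RegexQuery::from_pattern(".*b", title)?;
    let searcher = index.reader()?.searcher();
    searcher.search(&query, &Count)
}
```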
@@ -949,9 +902,6 @@ fn convert_literal_to_query(
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
LogicalLiteral::Regex { pattern, field } => {
Box::new(RegexQuery::from_regex(pattern, field))
}
}
}
@@ -1021,7 +971,7 @@ fn generate_literals_for_json_object(
// Try to convert the phrase to a fast value
if let Some(term) =
convert_to_fast_value_and_append_to_json_term(&get_term_with_path(), phrase, true)
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
{
logical_literals.push(LogicalLiteral::Term(term));
}
@@ -1150,15 +1100,11 @@ mod test {
query: &str,
default_conjunction: bool,
default_fields: &[&'static str],
allow_regexes: bool,
) -> Result<LogicalAst, QueryParserError> {
let mut query_parser = make_query_parser_with_default_fields(default_fields);
if default_conjunction {
query_parser.set_conjunction_by_default();
}
if allow_regexes {
query_parser.allow_regexes();
}
query_parser.parse_query_to_logical_ast(query)
}
@@ -1170,7 +1116,6 @@ mod test {
query,
default_conjunction,
&["title", "text"],
true,
)
}
@@ -1185,7 +1130,6 @@ mod test {
query,
default_conjunction,
default_fields,
true,
)
.unwrap();
let query_str = format!("{query:?}");
@@ -2049,66 +1993,4 @@ mod test {
Err(QueryParserError::ExpectedInt(_))
);
}
#[test]
pub fn test_deduplication() {
let query = "be be";
test_parse_query_to_logical_ast_helper(
query,
"(Term(field=0, type=Str, \"be\") Term(field=1, type=Str, \"be\"))",
false,
);
}
#[test]
pub fn test_regex() {
let expected_regex = tantivy_fst::Regex::new(r".*b").unwrap();
test_parse_query_to_logical_ast_helper(
"title:/.*b/",
format!("Regex(Field(0), {:#?})", expected_regex).as_str(),
false,
);
// Invalid field
let err = parse_query_to_logical_ast("float:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query only supported on text fields"
);
// No field specified
let err = parse_query_to_logical_ast("/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query need to target a specific field."
);
// Regex on a json path
let err = parse_query_to_logical_ast("title.subpath:/.*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex query does not support json paths."
);
// Invalid regex
let err = parse_query_to_logical_ast("title:/[A-Z*b/", false).unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Invalid regex: regex parse error:\n [A-Z*b\n ^\nerror: \
unclosed character class"
);
// Regexes not allowed
let err = parse_query_to_logical_ast_with_default_fields(
"title:/.*b/",
false,
&["title", "text"],
false,
)
.unwrap_err();
assert_eq!(
err.to_string(),
"Unsupported query: Regex queries are not allowed."
);
}
}

View File

@@ -12,14 +12,10 @@ pub use self::range_query_fastfield::*;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::Str
| Type::U64
| Type::I64
| Type::F64
| Type::Bool
| Type::Date
| Type::Json
| Type::IpAddr => true,
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::Json => {
true
}
Type::IpAddr => true,
Type::Facet | Type::Bytes => false,
}
}

View File

@@ -258,7 +258,7 @@ fn search_on_json_numerical_field(
let bounds = match typ.numerical_type().unwrap() {
NumericalType::I64 => {
let bounds = bounds.map_bound(|term| term.as_i64().unwrap());
let bounds = bounds.map_bound(|term| (term.as_i64().unwrap()));
match actual_column_type {
NumericalType::I64 => bounds.map_bound(|&term| term.to_u64()),
NumericalType::U64 => {
@@ -282,7 +282,7 @@ fn search_on_json_numerical_field(
}
}
NumericalType::U64 => {
let bounds = bounds.map_bound(|term| term.as_u64().unwrap());
let bounds = bounds.map_bound(|term| (term.as_u64().unwrap()));
match actual_column_type {
NumericalType::U64 => bounds.map_bound(|&term| term.to_u64()),
NumericalType::I64 => {
@@ -306,7 +306,7 @@ fn search_on_json_numerical_field(
}
}
NumericalType::F64 => {
let bounds = bounds.map_bound(|term| term.as_f64().unwrap());
let bounds = bounds.map_bound(|term| (term.as_f64().unwrap()));
match actual_column_type {
NumericalType::U64 => transform_from_f64_bounds::<u64>(&bounds),
NumericalType::I64 => transform_from_f64_bounds::<i64>(&bounds),
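
The `map_bound` calls above convert a bound's value while preserving its kind; a minimal stand-in for that crate-internal helper (the crate's version also covers the by-reference variants used here):

```rust
use std::ops::Bound;

// Apply `f` to the carried value, preserving Included/Excluded/Unbounded.
fn map_bound<T, U>(bound: Bound<T>, f: impl FnOnce(T) -> U) -> Bound<U> {
    match bound {
        Bound::Included(t) => Bound::Included(f(t)),
        Bound::Excluded(t) => Bound::Excluded(f(t)),
        Bound::Unbounded => Bound::Unbounded,
    }
}
```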

View File

@@ -11,7 +11,7 @@ mod tests {
use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{EnableScoring, Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, FAST, STRING, TEXT};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::{assert_nearly_equals, DocAddress, Index, IndexWriter, Term, TERMINATED};
#[test]
@@ -212,232 +212,4 @@ mod tests {
}
Ok(())
}
#[test]
fn test_term_query_fallback_to_fastfield() -> crate::Result<()> {
use crate::collector::Count;
use crate::schema::FAST;
// Create a FAST-only numeric field (not indexed)
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.commit()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
// TermQuery should fall back to a fastfield range query and match correctly.
let tq_10 = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
let tq_20 = TermQuery::new(
Term::from_field_u64(num_field, 20u64),
IndexRecordOption::Basic,
);
let tq_30 = TermQuery::new(
Term::from_field_u64(num_field, 30u64),
IndexRecordOption::Basic,
);
let count_10 = searcher.search(&tq_10, &Count)?;
let count_20 = searcher.search(&tq_20, &Count)?;
let count_30 = searcher.search(&tq_30, &Count)?;
assert_eq!(count_10, 2);
assert_eq!(count_20, 1);
assert_eq!(count_30, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_text_fast_only() -> crate::Result<()> {
use crate::collector::Count;
// FAST-only text field (not indexed)
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.add_document(doc!(text_field => "world"))?;
index_writer.add_document(doc!(text_field => "hello"))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_hello = TermQuery::new(
Term::from_field_text(text_field, "hello"),
IndexRecordOption::Basic,
);
let tq_world = TermQuery::new(
Term::from_field_text(text_field, "world"),
IndexRecordOption::Basic,
);
let tq_missing = TermQuery::new(
Term::from_field_text(text_field, "nope"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_hello, &Count)?, 2);
assert_eq!(searcher.search(&tq_world, &Count)?, 1);
assert_eq!(searcher.search(&tq_missing, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_json_fast_only() -> crate::Result<()> {
use crate::collector::Count;
use crate::fastfield::FastValue;
use crate::schema::FAST;
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "x"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 20, "b": "y"})))?;
index_writer.add_document(doc!(json_field => json!({"a": 10, "b": "z"})))?;
index_writer.commit()?;
}
fn json_term_fast<T: FastValue>(field: Field, path: &str, v: T) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_fast_value(v);
term
}
fn json_term_str(field: Field, path: &str, v: &str) -> Term {
let mut term = Term::from_field_json_path(field, path, true);
term.append_type_and_str(v);
term
}
let searcher = index.reader()?.searcher();
// numeric path match
let tq_a10 = TermQuery::new(
json_term_fast(json_field, "a", 10u64),
IndexRecordOption::Basic,
);
let tq_a20 = TermQuery::new(
json_term_fast(json_field, "a", 20u64),
IndexRecordOption::Basic,
);
let tq_a30 = TermQuery::new(
json_term_fast(json_field, "a", 30u64),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_a10, &Count)?, 2);
assert_eq!(searcher.search(&tq_a20, &Count)?, 1);
assert_eq!(searcher.search(&tq_a30, &Count)?, 0);
// string path match
let tq_bx = TermQuery::new(
json_term_str(json_field, "b", "x"),
IndexRecordOption::Basic,
);
let tq_by = TermQuery::new(
json_term_str(json_field, "b", "y"),
IndexRecordOption::Basic,
);
let tq_bm = TermQuery::new(
json_term_str(json_field, "b", "missing"),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_bx, &Count)?, 1);
assert_eq!(searcher.search(&tq_by, &Count)?, 1);
assert_eq!(searcher.search(&tq_bm, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_ip_fast_only() -> crate::Result<()> {
use std::net::IpAddr;
use std::str::FromStr;
use crate::collector::Count;
use crate::schema::{IntoIpv6Addr, FAST};
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let ip1 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
let ip2 = IpAddr::from_str("127.0.0.2").unwrap().into_ipv6_addr();
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.add_document(doc!(ip_field => ip2))?;
index_writer.add_document(doc!(ip_field => ip1))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq_ip1 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip1),
IndexRecordOption::Basic,
);
let tq_ip2 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip2),
IndexRecordOption::Basic,
);
let ip3 = IpAddr::from_str("127.0.0.3").unwrap().into_ipv6_addr();
let tq_ip3 = TermQuery::new(
Term::from_field_ip_addr(ip_field, ip3),
IndexRecordOption::Basic,
);
assert_eq!(searcher.search(&tq_ip1, &Count)?, 2);
assert_eq!(searcher.search(&tq_ip2, &Count)?, 1);
assert_eq!(searcher.search(&tq_ip3, &Count)?, 0);
Ok(())
}
#[test]
fn test_term_query_fallback_fastfield_with_scores_errors() -> crate::Result<()> {
use crate::collector::TopDocs;
// FAST-only numeric field (not indexed) should error when scoring is required
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("num", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.add_document(doc!(num_field => 10u64))?;
index_writer.add_document(doc!(num_field => 20u64))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let tq = TermQuery::new(
Term::from_field_u64(num_field, 10u64),
IndexRecordOption::Basic,
);
// Using TopDocs requires scoring; since the field is not indexed,
// TermQuery cannot score and should return a SchemaError.
let res = searcher.search(&tq, &TopDocs::with_limit(1));
assert!(matches!(res, Err(crate::TantivyError::SchemaError(_))));
Ok(())
}
}

View File

@@ -1,10 +1,8 @@
use std::fmt;
use std::ops::Bound;
use super::term_weight::TermWeight;
use crate::query::bm25::Bm25Weight;
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{EnableScoring, Explanation, Query, RangeQuery, Weight};
use crate::query::{EnableScoring, Explanation, Query, Weight};
use crate::schema::IndexRecordOption;
use crate::Term;
@@ -101,7 +99,7 @@ impl TermQuery {
EnableScoring::Enabled {
statistics_provider,
..
} => Bm25Weight::for_terms(statistics_provider, std::slice::from_ref(&self.term))?,
} => Bm25Weight::for_terms(statistics_provider, &[self.term.clone()])?,
EnableScoring::Disabled { .. } => {
Bm25Weight::new(Explanation::new("<no score>", 1.0f32), 1.0f32)
}
@@ -124,24 +122,6 @@ impl TermQuery {
impl Query for TermQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
// If the field is not indexed but is a suitable fast field, fall back to a range query
// on the fast field matching exactly this term.
//
// Note: This is considerably slower, since it requires scanning the entire fast field.
// TODO: The range query would gain from having a single-value optimization
let schema = enable_scoring.schema();
let field_entry = schema.get_field_entry(self.term.field());
if !field_entry.is_indexed()
&& field_entry.is_fast()
&& is_type_valid_for_fastfield_range_query(self.term.typ())
&& !enable_scoring.is_scoring_enabled()
{
let range_query = RangeQuery::new(
Bound::Included(self.term.clone()),
Bound::Included(self.term.clone()),
);
return range_query.weight(enable_scoring);
}
Ok(Box::new(self.specialized_weight(enable_scoring)?))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
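
The fallback above is equivalent to an inclusive range query whose two bounds are the same term; a hedged sketch building that query directly with public APIs (`num_field` is an assumed FAST-only u64 field):

```rust
use std::ops::Bound;
use tantivy::query::RangeQuery;
use tantivy::schema::Field;
use tantivy::Term;

// Matches exactly the docs whose fast-field value equals `value`.
fn exact_match_on_fast_field(num_field: Field, value: u64) -> RangeQuery {
    let term = Term::from_field_u64(num_field, value);
    RangeQuery::new(Bound::Included(term.clone()), Bound::Included(term))
}
```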

View File

@@ -25,8 +25,8 @@ impl TermScorer {
}
}
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
self.postings.block_cursor.seek_block(target_doc);
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
self.postings.block_cursor.shallow_seek(target_doc);
}
#[cfg(test)]
@@ -175,7 +175,7 @@ mod tests {
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek_block(1289);
term_scorer.shallow_seek(1289);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1289);
assert_eq!(term_scorer.doc(), 1290);
@@ -242,9 +242,9 @@ mod tests {
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
docs.seek_block(135);
docs.shallow_seek(135);
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
docs.seek_block(256);
docs.shallow_seek(256);
// the block is not loaded yet.
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
assert_eq!(256, docs.seek(256));
@@ -275,7 +275,7 @@ mod tests {
{
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
for d in docs {
term_scorer.seek_block(d);
term_scorer.shallow_seek(d);
block_max_scores_b.push(term_scorer.block_max_score());
}
}
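
For orientation, a sketch of how this two-phase seek supports block-max pruning; all names (`TermScorer`, `shallow_seek`/`seek_block`, `block_max_score`, `TERMINATED`) are crate-internal and taken from the hunk above:

```rust
// Crate-internal sketch: position the block cursor without decoding (cheap),
// read the block's max score, and only pay for a full seek when the block
// can still beat the current threshold.
fn seek_if_competitive(scorer: &mut TermScorer, target: DocId, threshold: Score) -> bool {
    scorer.shallow_seek(target); // `seek_block` in the renamed variant
    if scorer.block_max_score() <= threshold {
        return false; // whole block pruned; nothing was decoded
    }
    scorer.seek(target) != TERMINATED
}
```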

View File

@@ -5,10 +5,8 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::Scorer;
use crate::{DocId, Score};
// The buffered union looks ahead within a fixed-size sliding window
// of upcoming document IDs (the "horizon").
const HORIZON_NUM_TINYBITSETS: usize = HORIZON as usize / 64;
const HORIZON: u32 = 64u32 * 64u32;
const HORIZON_NUM_TINYBITSETS: usize = 64;
const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
// `drain_filter` is not stable yet.
// This function is similar, except that it is stable, and
@@ -29,26 +27,12 @@ where P: FnMut(&mut T) -> bool {
/// Creates a `DocSet` that iterate through the union of two or more `DocSet`s.
pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
/// Active scorers (already filtered of `TERMINATED`).
docsets: Vec<TScorer>,
/// Sliding window presence map for upcoming docs.
///
/// There are `HORIZON_NUM_TINYBITSETS` buckets, each covering
/// a span of 64 doc IDs. Bucket `i` represents the range
/// `[window_start_doc + i*64, window_start_doc + (i+1)*64)`.
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
// Index of the current TinySet bucket within the sliding window.
bucket_idx: usize,
/// Per-doc score combiners for the current window.
///
/// These accumulators merge contributions from all scorers that
/// hit the same doc within the buffered window.
scores: Box<[TScoreCombiner; HORIZON as usize]>,
/// Start doc ID (inclusive) of the current sliding window.
window_start_doc: DocId,
/// Current doc ID of the union.
cursor: usize,
offset: DocId,
doc: DocId,
/// Combined score for current `doc` as produced by `TScoreCombiner`.
score: Score,
}
@@ -90,8 +74,8 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
scores: Box::new([score_combiner_fn(); HORIZON as usize]),
bucket_idx: HORIZON_NUM_TINYBITSETS,
window_start_doc: 0,
cursor: HORIZON_NUM_TINYBITSETS,
offset: 0,
doc: 0,
score: 0.0,
};
@@ -105,10 +89,8 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
fn refill(&mut self) -> bool {
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
// Reset the sliding window to start at the smallest doc
// across all scorers and prebuffer within the horizon.
self.window_start_doc = min_doc;
self.bucket_idx = 0;
self.offset = min_doc;
self.cursor = 0;
self.doc = min_doc;
refill(
&mut self.docsets,
@@ -123,16 +105,16 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
}
fn advance_buffered(&mut self) -> bool {
while self.bucket_idx < HORIZON_NUM_TINYBITSETS {
if let Some(val) = self.bitsets[self.bucket_idx].pop_lowest() {
let delta = val + (self.bucket_idx as u32) * 64;
self.doc = self.window_start_doc + delta;
while self.cursor < HORIZON_NUM_TINYBITSETS {
if let Some(val) = self.bitsets[self.cursor].pop_lowest() {
let delta = val + (self.cursor as u32) * 64;
self.doc = self.offset + delta;
let score_combiner = &mut self.scores[delta as usize];
self.score = score_combiner.score();
score_combiner.clear();
return true;
} else {
self.bucket_idx += 1;
self.cursor += 1;
}
}
false
@@ -162,19 +144,19 @@ where
if self.doc >= target {
return self.doc;
}
let gap = target - self.window_start_doc;
let gap = target - self.offset;
if gap < HORIZON {
// Our value is within the buffered horizon.
// Skipping to corresponding bucket.
let new_bucket_idx = gap as usize / 64;
for obsolete_tinyset in &mut self.bitsets[self.bucket_idx..new_bucket_idx] {
// Skipping to corresponding bucket.
let new_cursor = gap as usize / 64;
for obsolete_tinyset in &mut self.bitsets[self.cursor..new_cursor] {
obsolete_tinyset.clear();
}
for score_combiner in &mut self.scores[self.bucket_idx * 64..new_bucket_idx * 64] {
for score_combiner in &mut self.scores[self.cursor * 64..new_cursor * 64] {
score_combiner.clear();
}
self.bucket_idx = new_bucket_idx;
self.cursor = new_cursor;
// Advancing until we reach the end of the bucket
// or we reach a doc greater than or equal to the target.
@@ -229,7 +211,7 @@ where
if self.doc == TERMINATED {
return 0;
}
let mut count = self.bitsets[self.bucket_idx..HORIZON_NUM_TINYBITSETS]
let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
.iter()
.map(|bitset| bitset.len())
.sum::<u32>()
@@ -243,7 +225,7 @@ where
bitset.clear();
}
}
self.bucket_idx = HORIZON_NUM_TINYBITSETS;
self.cursor = HORIZON_NUM_TINYBITSETS;
count
}
}
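
A self-contained worked example of the window arithmetic documented above: with `HORIZON = 64 * 64`, each of the 64 `TinySet` buckets covers 64 doc IDs relative to the window start.

```rust
const HORIZON: u32 = 64 * 64; // 64 buckets x 64 doc IDs

// Map a doc ID to its (bucket, bit) slot in the sliding window,
// or None if it lies beyond the buffered horizon.
fn bucket_and_bit(window_start_doc: u32, doc: u32) -> Option<(usize, u32)> {
    let delta = doc.checked_sub(window_start_doc)?;
    (delta < HORIZON).then(|| ((delta / 64) as usize, delta % 64))
}

fn main() {
    // Window starting at doc 10_000: doc 10_137 has delta 137 -> bucket 2, bit 9.
    assert_eq!(bucket_and_bit(10_000, 10_137), Some((2, 9)));
    // Doc 14_096 is past the horizon (delta 4_096) and forces a refill.
    assert_eq!(bucket_and_bit(10_000, 14_096), None);
}
```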

View File

@@ -80,7 +80,6 @@
//! }
//!
//! /// Our custom iterator just helps us to avoid some messy generics.
//! #[allow(dead_code)]
//! pub struct MyCustomIter<'a>(btree_map::Iter<'a, Field, serde_json::Value>);
//! impl<'a> Iterator for MyCustomIter<'a> {
//! // Here we can see our field-value pairs being produced by the iterator.

View File

@@ -1561,6 +1561,7 @@ fn to_ascii(text: &str, output: &mut String) {
#[cfg(test)]
mod tests {
use std::iter;
use super::to_ascii;
use crate::tokenizer::{AsciiFoldingFilter, RawTokenizer, SimpleTokenizer, TextAnalyzer};

View File

@@ -308,9 +308,10 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
}
}
_ => {
return Err(io::Error::other(format!(
"Unsupported sstable version, expected one of [2, 3], found {version}"
)));
return Err(io::Error::new(
io::ErrorKind::Other,
format!("Unsupported sstable version, expected one of [2, 3], found {version}"),
));
}
};
@@ -608,12 +609,12 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> StreamerBuilder<'_, TSSTable> {
pub fn range(&self) -> StreamerBuilder<TSSTable> {
StreamerBuilder::new(self, AlwaysMatch)
}
/// Returns a range builder filtered with a prefix.
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<'_, TSSTable> {
pub fn prefix_range<K: AsRef<[u8]>>(&self, prefix: K) -> StreamerBuilder<TSSTable> {
let lower_bound = prefix.as_ref();
let mut upper_bound = lower_bound.to_vec();
for idx in (0..upper_bound.len()).rev() {
@@ -632,7 +633,7 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
}
/// A stream of all the sorted terms.
pub fn stream(&self) -> io::Result<Streamer<'_, TSSTable>> {
pub fn stream(&self) -> io::Result<Streamer<TSSTable>> {
self.range().into_stream()
}
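
The `prefix_range` construction above derives its exclusive upper bound by incrementing the prefix; a standalone sketch of that bound computation (an all-0xFF prefix presumably leaves the range unbounded above):

```rust
// Smallest byte string strictly greater than every key with this prefix:
// bump the last non-0xFF byte and truncate everything after it.
fn prefix_upper_bound(prefix: &[u8]) -> Option<Vec<u8>> {
    let mut upper = prefix.to_vec();
    for idx in (0..upper.len()).rev() {
        if upper[idx] != u8::MAX {
            upper[idx] += 1;
            upper.truncate(idx + 1);
            return Some(upper);
        }
    }
    None // no finite upper bound exists
}
```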

View File

@@ -54,14 +54,14 @@ pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
}
}
for _ in 0..len - 1 {
if let Some(mut head) = heap.peek_mut()
&& head.0.key() == writer.last_inserted_key()
{
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
if let Some(mut head) = heap.peek_mut() {
if head.0.key() == writer.last_inserted_key() {
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
continue;
}
continue;
}
break;
}
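
Both variants above (the let-chain and the nested `if`) express the same drain-equal-keys step after each pop; a simplified standalone sketch of that `BinaryHeap`/`PeekMut` pattern, with plain keys and no value merging:

```rust
use std::cmp::Reverse;
use std::collections::binary_heap::PeekMut;
use std::collections::BinaryHeap;

// K-way merge with deduplication: pop the smallest key, then pop any
// heap heads carrying the same key before moving on.
fn kway_merge_dedup(mut heap: BinaryHeap<Reverse<u64>>) -> Vec<u64> {
    let mut out = Vec::new();
    while let Some(Reverse(key)) = heap.pop() {
        out.push(key);
        while let Some(top) = heap.peek_mut() {
            if top.0 == key {
                PeekMut::pop(top); // duplicate of the key just emitted
            } else {
                break;
            }
        }
    }
    out
}
```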

View File

@@ -394,7 +394,7 @@ impl SSTableIndexBuilder {
fn fst_error_to_io_error(error: tantivy_fst::Error) -> io::Error {
match error {
tantivy_fst::Error::Fst(fst_error) => io::Error::other(fst_error),
tantivy_fst::Error::Fst(fst_error) => io::Error::new(io::ErrorKind::Other, fst_error),
tantivy_fst::Error::Io(ioerror) => ioerror,
}
}
@@ -438,7 +438,7 @@ impl BlockAddrBlockMetadata {
let ordinal_addr = range_start_addr + self.range_start_nbits as usize;
let range_end_addr = range_start_addr + num_bits;
if (range_end_addr + self.range_start_nbits as usize).div_ceil(8) > data.len() {
if (range_end_addr + self.range_start_nbits as usize + 7) / 8 > data.len() {
return None;
}
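
Both spellings compute the same round-up from bits to bytes; `div_ceil` (stable on integer types since Rust 1.73) avoids the manual idiom and its overflow in `x + 7` near the type's maximum. A quick equivalence check:

```rust
fn main() {
    for n in [0usize, 1, 7, 8, 9, 63, 64, 65] {
        assert_eq!((n + 7) / 8, n.div_ceil(8));
    }
}
```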

View File

@@ -274,12 +274,13 @@ impl SharedArenaHashMap {
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return None;
} else if kv.hash == hash
&& let Some(val_addr) =
} else if kv.hash == hash {
if let Some(val_addr) =
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
{
let v = memory_arena.read(val_addr);
return Some(v);
{
let v = memory_arena.read(val_addr);
return Some(v);
}
}
}
}
@@ -333,14 +334,15 @@ impl SharedArenaHashMap {
self.set_bucket(hash, key_addr, bucket);
return val;
}
if kv.hash == hash
&& let Some(val_addr) =
if kv.hash == hash {
if let Some(val_addr) =
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
{
let v = memory_arena.read(val_addr);
let new_v = updater(Some(v));
memory_arena.write_at(val_addr, new_v);
return new_v;
{
let v = memory_arena.read(val_addr);
let new_v = updater(Some(v));
memory_arena.write_at(val_addr, new_v);
return new_v;
}
}
// This allows fetching the next bucket before the loop jmp
bucket = probe.next_probe();
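
A hedged standalone sketch of the probe loop's shape: an empty bucket ends the search, and the cached hash is compared before the more expensive key comparison, which is what both variants above express. Linear probing stands in here for the crate's `probe.next_probe()` sequence, and the arena-backed storage is replaced by a plain struct:

```rust
struct Slot {
    hash: u64,
    key: Option<Vec<u8>>, // None marks an empty bucket
    val: u32,
}

// Lookup: stop at the first empty bucket (miss); return the value only
// when both the cached hash and the key bytes match.
fn probe_get(table: &[Slot], hash: u64, key: &[u8]) -> Option<u32> {
    let mask = table.len() - 1; // table length is a power of two
    let mut bucket = (hash as usize) & mask;
    loop {
        let slot = &table[bucket];
        match &slot.key {
            None => return None,
            Some(k) if slot.hash == hash && k.as_slice() == key => return Some(slot.val),
            _ => bucket = (bucket + 1) & mask,
        }
    }
}
```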