mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-09 10:32:55 +00:00

Compare commits: binggan-0. ... dependabot
16 Commits
| Author | SHA1 | Date |
|---|---|---|
| | ec5748d795 | |
| | c71ea7b2ef | |
| | c35a782747 | |
| | c66af2c0a9 | |
| | f9ac055847 | |
| | 21d057059e | |
| | dca508b4ca | |
| | aebae9965d | |
| | e7e3e3f44c | |
| | 2f2db16ec1 | |
| | d152e29687 | |
| | 285bcc25c9 | |
| | 7b65ad922d | |
| | 99be20cedd | |
| | 5f026901b8 | |
| | 6dfa2df06f | |
.github/workflows/coverage.yml (vendored, 2 changes)

@@ -21,7 +21,7 @@ jobs:
 - name: Generate code coverage
 run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
 - name: Upload coverage to Codecov
-uses: codecov/codecov-action@v3
+uses: codecov/codecov-action@v5
 continue-on-error: true
 with:
 token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
@@ -46,7 +46,7 @@ The file of a segment has the format

 ```segment-id . ext```

-The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
+The extension signals which data structure (or [`SegmentComponent`](src/index/segment_component.rs)) is stored in the file.

 A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.

@@ -102,7 +102,7 @@ but users can extend tantivy with their own implementation.

 Tantivy's document follows a very strict schema, decided before building any index.

-The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
+The schema defines all of the fields that the indexes [`Document`](src/schema/document/mod.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.

 Depending on the type of the field, you can decide to
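The schema hunk above only adjusts a documentation link, but the API it describes can be sketched briefly. The following is a hedged example (field names and options are illustrative, not taken from this change set) of declaring a strict schema before building an index:

```rust
use tantivy::schema::{Schema, INDEXED, STORED, TEXT};

fn main() {
    // Every field, its type, and how it is indexed / stored is fixed up front.
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_u64_field("timestamp", INDEXED | STORED);
    let schema = schema_builder.build();

    // Later lookups go through the schema, never through ad-hoc field names.
    let title = schema.get_field("title").expect("declared above");
    println!("title has field id {}", title.field_id());
}
```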
CITATION.cff (new file, 10 lines)

@@ -0,0 +1,10 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- alias: Quickwit Inc.
+  website: "https://quickwit.io"
+title: "tantivy"
+version: 0.22.0
+doi: 10.5281/zenodo.13942948
+date-released: 2024-10-17
+url: "https://github.com/quickwit-oss/tantivy"
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2021"
-rust-version = "1.66"
+rust-version = "1.75"
 exclude = ["benches/*.json", "benches/*.txt"]

 [dependencies]

@@ -43,8 +43,8 @@ bitpacking = { version = "0.9.2", default-features = false, features = [
 "bitpacker4x",
 ] }
 census = "0.4.2"
-rustc-hash = "1.1.0"
-thiserror = "1.0.30"
+rustc-hash = "2.0.0"
+thiserror = "2.0.1"
 htmlescape = "0.3.1"
 fail = { version = "0.5.0", optional = true }
 time = { version = "0.3.35", features = ["serde-well-known"] }

@@ -72,7 +72,7 @@ fnv = "1.0.7"
 winapi = "0.3.9"

 [dev-dependencies]
-binggan = "0.12.0"
+binggan = "0.14.0"
 rand = "0.8.5"
 maplit = "1.0.2"
 matches = "0.1.9"
@@ -20,7 +20,6 @@ macro_rules! register {
 ($runner:expr, $func:ident) => {
 $runner.register(stringify!($func), move |index| {
 $func(index);
-None
 })
 };
 }

@@ -35,8 +35,8 @@ const IMPLS: [FilterImplPerInstructionSet; 2] = [
 const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];

 impl FilterImplPerInstructionSet {
-#[allow(unused_variables)]
 #[inline]
+#[allow(unused_variables)] // on non-x86_64, code is unused.
 fn from(code: u8) -> FilterImplPerInstructionSet {
 #[cfg(target_arch = "x86_64")]
 if code == FilterImplPerInstructionSet::AVX2 as u8 {

@@ -23,7 +23,7 @@ downcast-rs = "1.2.0"
 proptest = "1"
 more-asserts = "0.3.1"
 rand = "0.8"
-binggan = "0.12.0"
+binggan = "0.14.0"

 [[bench]]
 name = "bench_merge"
@@ -42,7 +42,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
 }
 }
 black_box(sum);
-None
 });
 runner.register("access_first_vals", |column| {
 let mut sum = 0;

@@ -63,7 +62,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
 }

 black_box(sum);
-None
 });
 runner.run();
 }

@@ -1,6 +1,6 @@
 pub mod common;

-use binggan::{black_box, BenchRunner};
+use binggan::BenchRunner;
 use common::{generate_columnar_with_name, Card};
 use tantivy_columnar::*;

@@ -29,7 +29,7 @@ fn main() {
 add_combo(Card::Multi, Card::Dense);
 add_combo(Card::Multi, Card::Sparse);

-let runner: BenchRunner = BenchRunner::new();
+let mut runner: BenchRunner = BenchRunner::new();
 let mut group = runner.new_group();
 for (input_name, columnar_readers) in inputs.iter() {
 group.register_with_input(

@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
 &'a self,
 docs: &'a [u32],
 accessor: &Column<T>,
-) -> impl Iterator<Item = (DocId, T)> + '_ {
+) -> impl Iterator<Item = (DocId, T)> + 'a {
 if accessor.index.get_cardinality().is_full() {
 docs.iter().cloned().zip(self.val_cache.iter().cloned())
 } else {
@@ -82,7 +82,7 @@ impl<'a> SparseBlock<'a> {
 }

 #[inline]
-#[allow(clippy::comparison_chain)]
+#[expect(clippy::comparison_chain)]
 // Looks for the element in the block. Returns the positions if found.
 fn binary_search(&self, target: u16) -> Result<u16, u16> {
 let data = &self.0;
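Many hunks in this change set swap `#[allow(...)]` for `#[expect(...)]`. As a rough illustration (not code from the repository, and assuming a toolchain where `#[expect]` is stable, i.e. Rust 1.81 or later): `expect` silences the lint exactly like `allow`, but additionally warns when the lint no longer fires, so stale suppressions surface on their own.

```rust
// Hypothetical example: the expectation is fulfilled as long as clippy still
// flags the indexed loop below; rewrite the loop as an iterator and the
// #[expect] attribute itself starts warning.
#[expect(clippy::needless_range_loop)]
fn sum(values: &[u32]) -> u32 {
    let mut total = 0;
    for i in 0..values.len() {
        total += values[i];
    }
    total
}

fn main() {
    assert_eq!(sum(&[1, 2, 3]), 6);
}
```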
@@ -128,7 +128,7 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
 use super::*;
 use crate::column_values::u64_based::{
 serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,

@@ -122,7 +122,6 @@ impl<T> From<T> for ColumnOperation<T> {
 // In order to limit memory usage, and in order
 // to benefit from the stacker, we do this by serialization our data
 // as "Symbols".
-#[allow(clippy::from_over_into)]
 pub(super) trait SymbolValue: Clone + Copy {
 // Serializes the symbol into the given buffer.
 // Returns the number of bytes written into the buffer.

@@ -392,7 +392,7 @@ impl ColumnarWriter {

 // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
 // Column: [Column Index, Column Values, column index num bytes U32::LE]
-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 fn serialize_bytes_or_str_column(
 cardinality: Cardinality,
 num_docs: RowId,

@@ -19,7 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
 serde = { version = "1.0.136", features = ["derive"] }

 [dev-dependencies]
-binggan = "0.12.0"
+binggan = "0.14.0"
 proptest = "1.0.0"
 rand = "0.8.4"

@@ -15,7 +15,6 @@ fn bench_vint() {
 out += u64::from(buf[0]);
 }
 black_box(out);
-None
 });

 let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);

@@ -27,7 +26,6 @@ fn bench_vint() {
 out += u64::from(buf[0]);
 }
 black_box(out);
-None
 });
 }
@@ -43,24 +41,20 @@ fn bench_bitset() {
 tinyset.pop_lowest();
 tinyset.pop_lowest();
 black_box(tinyset);
-None
 });

 let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
 runner.bench_function("bench_tinyset_sum", move |_| {
 assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
-None
 });

 let v = [10u32, 14u32, 21u32];
 runner.bench_function("bench_tinyarr_sum", move |_| {
 black_box(v.iter().cloned().sum::<u32>());
-None
 });

 runner.bench_function("bench_bitset_initialize", move |_| {
 black_box(BitSet::with_max_value(1_000_000));
-None
 });
 }

@@ -130,11 +130,11 @@ pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
 }

 #[cfg(test)]
-pub mod test {
+pub(crate) mod test {

 use proptest::prelude::*;

-use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
+use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};

 fn test_i64_converter_helper(val: i64) {
 assert_eq!(u64_to_i64(i64_to_u64(val)), val);

@@ -144,12 +144,6 @@ pub mod test {
 assert_eq!(u64_to_f64(f64_to_u64(val)), val);
 }

-pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
-let mut buffer = Vec::new();
-O::default().serialize(&mut buffer).unwrap();
-assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
-}
-
 proptest! {
 #[test]
 fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
@@ -74,14 +74,14 @@ impl FixedSize for () {

 impl<T: BinarySerializable> BinarySerializable for Vec<T> {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-VInt(self.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
 for it in self {
 it.serialize(writer)?;
 }
 Ok(())
 }
 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
-let num_items = VInt::deserialize(reader)?.val();
+let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
 let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
 for _ in 0..num_items {
 let item = T::deserialize(reader)?;
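The `VInt(...).serialize(writer)` to `BinarySerializable::serialize(&VInt(...), writer)` rewrites above spell out which trait's method is meant, which is the standard fix when more than one `serialize`/`deserialize` is in scope. A small, self-contained sketch of the same disambiguation technique, with hypothetical traits:

```rust
trait BinEncode {
    fn encode(&self) -> Vec<u8>;
}
trait TextEncode {
    fn encode(&self) -> Vec<u8>;
}

struct VInt(u64);

impl BinEncode for VInt {
    fn encode(&self) -> Vec<u8> {
        self.0.to_le_bytes().to_vec()
    }
}
impl TextEncode for VInt {
    fn encode(&self) -> Vec<u8> {
        self.0.to_string().into_bytes()
    }
}

fn main() {
    let v = VInt(7);
    // `v.encode()` would be rejected as ambiguous; naming the trait resolves it.
    assert_eq!(BinEncode::encode(&v).len(), 8);
    assert_eq!(<VInt as TextEncode>::encode(&v), b"7".to_vec());
}
```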
@@ -236,12 +236,12 @@ impl FixedSize for bool {
 impl BinarySerializable for String {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
 let data: &[u8] = self.as_bytes();
-VInt(data.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
 writer.write_all(data)
 }

 fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
-let string_length = VInt::deserialize(reader)?.val() as usize;
+let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
 let mut result = String::with_capacity(string_length);
 reader
 .take(string_length as u64)

@@ -253,12 +253,12 @@ impl BinarySerializable for String {
 impl<'a> BinarySerializable for Cow<'a, str> {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
 let data: &[u8] = self.as_bytes();
-VInt(data.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
 writer.write_all(data)
 }

 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
-let string_length = VInt::deserialize(reader)?.val() as usize;
+let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
 let mut result = String::with_capacity(string_length);
 reader
 .take(string_length as u64)

@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {

 impl<'a> BinarySerializable for Cow<'a, [u8]> {
 fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-VInt(self.len() as u64).serialize(writer)?;
+BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
 for it in self.iter() {
-it.serialize(writer)?;
+BinarySerializable::serialize(it, writer)?;
 }
 Ok(())
 }

 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
-let num_items = VInt::deserialize(reader)?.val();
+let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
 let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
 for _ in 0..num_items {
-let item = u8::deserialize(reader)?;
+let item = <u8 as BinarySerializable>::deserialize(reader)?;
 items.push(item);
 }
 Ok(Cow::Owned(items))
@@ -2,7 +2,7 @@

 > Tantivy is a **search** engine **library** for Rust.

-If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
+If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. Tantivy is heavily inspired by Lucene's design and
 they both have the same scope and targeted use cases.

 If you are not familiar with Lucene, let's break down our little tagline.

@@ -17,7 +17,7 @@ relevancy, collapsing, highlighting, spatial search.
 experience. But keep in mind this is just a toolbox.
 Which bring us to the second keyword...

-- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
+- **Library** means that you will have to write code. Tantivy is not an *all-in-one* server solution like Elasticsearch for instance.

 Sometimes a functionality will not be available in tantivy because it is too
 specific to your use case. By design, tantivy should make it possible to extend

@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
 index from a different format.

 Tantivy exposes a lot of low level API to do all of these things.
@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
 While this design has some downsides, this greatly simplifies the source code of
 tantivy. Caching is also entirely delegated to the OS.

-`tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
+Tantivy works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.

 This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.
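To make the "first query in milliseconds" claim concrete, here is a hedged sketch of the workflow this paragraph describes; the directory path is hypothetical and error handling is kept minimal:

```rust
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Opening the index only mmaps the segment files; nothing is eagerly loaded.
    let index = Index::open_in_dir("/path/to/existing/index")?;

    // For the sake of the example, parse the query against every schema field.
    let fields = index.schema().fields().map(|(field, _)| field).collect();
    let query = QueryParser::for_index(&index, fields)
        .parse_query("hello")
        .expect("valid query");

    let searcher = index.reader()?.searcher();
    println!("{} documents match", searcher.search(&query, &Count)?);
    Ok(())
}
```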
@@ -31,13 +31,13 @@ Compression ratio is mainly affected on the fast field of the sorted property, e
 When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
 E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.

-Note: Tantivy 0.16 does not do this optimization yet.
+Note: tantivy 0.16 does not do this optimization yet.

 ### Pruning

 Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.

-Note: Tantivy 0.16 does not do this optimization yet.
+Note: tantivy 0.16 does not do this optimization yet.

 ### Other?

@@ -45,7 +45,7 @@ In principle there are many algorithms possible that exploit the monotonically i

 ## Usage

-The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
+The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.

 ```rust
 let settings = IndexSettings {

@@ -39,7 +39,7 @@ Its representation is done by separating segments by a unicode char `\x01`, and
 - `value`: The value representation is just the regular Value representation.

 This representation is designed to align the natural sort of Terms with the lexicographical sort
-of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
+of their binary representation (tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).

 In the example above, the terms will be sorted as
@@ -151,7 +151,7 @@ impl fmt::Debug for OwnedBytes {
 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 // We truncate the bytes in order to make sure the debug string
 // is not too long.
-let bytes_truncated: &[u8] = if self.len() > 8 {
+let bytes_truncated: &[u8] = if self.len() > 10 {
 &self.as_slice()[..10]
 } else {
 self.as_slice()

@@ -252,6 +252,11 @@ mod tests {
 format!("{short_bytes:?}"),
 "OwnedBytes([97, 98, 99, 100], len=4)"
 );
+let medium_bytes = OwnedBytes::new(b"abcdefghi".as_ref());
+assert_eq!(
+format!("{medium_bytes:?}"),
+"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105], len=9)"
+);
 let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
 assert_eq!(
 format!("{long_bytes:?}"),
@@ -111,7 +111,6 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
 Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
 // old versions don't understand this is uninhabited and need the empty match to help,
 // newer versions warn because this arm is unreachable (which it is indeed).
 #[allow(unreachable_patterns)]
 Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
 }
 }

@@ -767,7 +767,7 @@ fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
 tuple((fallible(occur_symbol), boosted_leaf))(inp)
 }

-#[allow(clippy::type_complexity)]
+#[expect(clippy::type_complexity)]
 fn operand_occur_leaf_infallible(
 inp: &str,
 ) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
@@ -1,4 +1,5 @@
 //! Contains the final aggregation tree.
+//!
 //! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
 //! This conversion computes the final result. For example: The intermediate result contains
 //! intermediate average results, which is the sum and the number of values. The actual average is

@@ -187,7 +188,7 @@ pub enum BucketEntries<T> {
 }

 impl<T> BucketEntries<T> {
-fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &T> + 'a> {
+fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
 match self {
 BucketEntries::Vec(vec) => Box::new(vec.iter()),
 BucketEntries::HashMap(map) => Box::new(map.values()),
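The `Item = &T` to `Item = &'a T` fix above ties the items yielded by the boxed iterator to the borrow of `self`. A stripped-down, hypothetical reproduction of the same pattern:

```rust
struct Entries<T>(Vec<T>);

impl<T> Entries<T> {
    // The boxed iterator borrows from `self`, so both the iterator (`+ 'a`) and
    // the items it yields (`&'a T`) are tied to the same lifetime.
    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
        Box::new(self.0.iter())
    }
}

fn main() {
    let entries = Entries(vec![1, 2, 3]);
    assert_eq!(entries.iter().sum::<i32>(), 6);
}
```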
@@ -244,7 +244,7 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
 use pretty_assertions::assert_eq;

 use super::*;

@@ -16,6 +16,7 @@ use crate::aggregation::*;
 use crate::TantivyError;

 /// Provide user-defined buckets to aggregate on.
+///
 /// Two special buckets will automatically be created to cover the whole range of values.
 /// The provided buckets have to be continuous.
 /// During the aggregation, the values extracted from the fast_field `field` will be checked

@@ -1232,8 +1232,8 @@ mod tests {
 #[test]
 fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
 let terms_per_segment = vec![
 vec!["terma", "terma", "termb", "termb", "termb", "termc"],
 vec!["terma", "terma", "termb", "termc", "termc"],
 vec!["terma", "terma", "termb", "termb", "termb"],
 vec!["terma", "terma", "termb"],
 ];

 let index = get_test_index_from_terms(false, &terms_per_segment)?;

@@ -1255,8 +1255,6 @@
 assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
 assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
 assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
 assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
 assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
 assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
 assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
@@ -180,7 +180,7 @@ pub(crate) fn deserialize_option_f64<'de, D>(deserializer: D) -> Result<Option<f
 where D: Deserializer<'de> {
 struct StringOrFloatVisitor;

-impl<'de> Visitor<'de> for StringOrFloatVisitor {
+impl Visitor<'_> for StringOrFloatVisitor {
 type Value = Option<f64>;

 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {

@@ -226,7 +226,7 @@ pub(crate) fn deserialize_f64<'de, D>(deserializer: D) -> Result<f64, D::Error>
 where D: Deserializer<'de> {
 struct StringOrFloatVisitor;

-impl<'de> Visitor<'de> for StringOrFloatVisitor {
+impl Visitor<'_> for StringOrFloatVisitor {
 type Value = f64;

 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {

@@ -13,7 +13,7 @@ struct Hit<'a> {
 facet: &'a Facet,
 }

-impl<'a> Eq for Hit<'a> {}
+impl Eq for Hit<'_> {}

 impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
 fn eq(&self, other: &Hit<'_>) -> bool {

@@ -27,7 +27,7 @@ impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
 }
 }

-impl<'a> Ord for Hit<'a> {
+impl Ord for Hit<'_> {
 fn cmp(&self, other: &Self) -> Ordering {
 other
 .count
@@ -182,6 +182,7 @@ where
 }

 /// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
+///
 /// it transparently wraps an inner [`Collector`] but filters documents
 /// based on the result of applying the predicate to the bytes fast field.
 ///

@@ -495,4 +495,4 @@ where
 impl_downcast!(Fruit);

 #[cfg(test)]
-pub mod tests;
+pub(crate) mod tests;

@@ -161,7 +161,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
 /// # Ok(())
 /// # }
 /// ```
-#[allow(clippy::type_complexity)]
+#[expect(clippy::type_complexity)]
 #[derive(Default)]
 pub struct MultiCollector<'a> {
 collector_wrappers: Vec<

@@ -190,7 +190,7 @@ impl<'a> MultiCollector<'a> {
 }
 }

-impl<'a> Collector for MultiCollector<'a> {
+impl Collector for MultiCollector<'_> {
 type Fruit = MultiFruit;
 type Child = MultiCollectorChild;
@@ -44,8 +44,19 @@ fn test_format_6() {
 assert_date_time_precision(&index, DateTimePrecision::Microseconds);
 }

 /// feature flag quickwit uses a different dictionary type
+#[test]
 #[cfg(not(feature = "quickwit"))]
-fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
+fn test_format_7() {
+let path = path_for_version("7");
+
+let index = Index::open_in_dir(path).expect("Failed to open index");
+// dates are not truncated in v7 in the docstore
+assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
+}
+
+#[cfg(not(feature = "quickwit"))]
+fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
 use collector::TopDocs;
 let reader = index.reader().expect("Failed to create reader");
 let searcher = reader.searcher();

@@ -75,6 +86,6 @@ fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
 .as_datetime()
 .unwrap();

-let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
+let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
 assert_eq!(date_value, expected,);
 }
@@ -71,7 +71,7 @@ pub fn json_path_sep_to_dot(path: &mut str) {
 }
 }

-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 fn index_json_object<'a, V: Value<'a>>(
 doc: DocId,
 json_visitor: V::ObjectIter,

@@ -101,7 +101,7 @@ fn index_json_object<'a, V: Value<'a>>(
 }
 }

-#[allow(clippy::too_many_arguments)]
+#[expect(clippy::too_many_arguments)]
 pub(crate) fn index_json_value<'a, V: Value<'a>>(
 doc: DocId,
 json_value: V,

@@ -39,7 +39,7 @@ impl RetryPolicy {
 /// The `DirectoryLock` is an object that represents a file lock.
 ///
 /// It is associated with a lock file, that gets deleted on `Drop.`
-#[allow(dead_code)]
+#[expect(dead_code)]
 pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);

 struct DirectoryLockGuard {

@@ -48,6 +48,7 @@ pub static INDEX_WRITER_LOCK: Lazy<Lock> = Lazy::new(|| Lock {
 });
 /// The meta lock file is here to protect the segment files being opened by
 /// `IndexReader::reload()` from being garbage collected.
+///
 /// It makes it possible for another process to safely consume
 /// our index in-writing. Ideally, we may have preferred `RWLock` semantics
 /// here, but it is difficult to achieve on Windows.
@@ -244,7 +244,7 @@ impl MmapDirectory {
 directory_path,
 )));
 }
-#[allow(clippy::bind_instead_of_map)]
+#[expect(clippy::bind_instead_of_map)]
 let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
 let directory_path = directory_path.to_owned();

@@ -32,7 +32,7 @@ pub struct WatchCallbackList {
 /// file change is detected.
 #[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
 #[derive(Clone)]
-#[allow(dead_code)]
+#[expect(dead_code)]
 pub struct WatchHandle(Arc<WatchCallback>);

 impl WatchHandle {

@@ -117,7 +117,7 @@ pub trait DocSet: Send {
 }
 }

-impl<'a> DocSet for &'a mut dyn DocSet {
+impl DocSet for &mut dyn DocSet {
 fn advance(&mut self) -> u32 {
 (**self).advance()
 }
@@ -149,7 +149,7 @@ impl FieldNormReader {
 }

 #[cfg(test)]
-pub fn for_test(field_norms: &[u32]) -> FieldNormReader {
+pub(crate) fn for_test(field_norms: &[u32]) -> FieldNormReader {
 let field_norms_id = field_norms
 .iter()
 .cloned()

@@ -1,12 +1,9 @@
 #![allow(deprecated)] // Remove with index sorting

 use std::collections::HashSet;

 use rand::{thread_rng, Rng};

 use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
 use crate::schema::*;
 #[allow(deprecated)]
 use crate::{doc, schema, Index, IndexWriter, Searcher};

 fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
@@ -31,7 +31,6 @@ pub struct InvertedIndexReader {
 }

 impl InvertedIndexReader {
-#[allow(clippy::needless_pass_by_value)] // for symmetry
 pub(crate) fn new(
 termdict: TermDictionary,
 postings_file_slice: FileSlice,

@@ -205,16 +204,6 @@ impl InvertedIndexReader {
 .transpose()
 }

-pub(crate) fn read_postings_no_deletes(
-&self,
-term: &Term,
-option: IndexRecordOption,
-) -> io::Result<Option<SegmentPostings>> {
-self.get_term_info(term)?
-.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
-.transpose()
-}
-
 /// Returns the number of documents containing the term.
 pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
 Ok(self

@@ -1,6 +1,7 @@
 use std::slice;

 /// Enum describing each component of a tantivy segment.
+///
 /// Each component is stored in its own file,
 /// using the pattern `segment_uuid`.`component_extension`,
 /// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`
@@ -478,7 +478,7 @@ pub fn merge_field_meta_data(
 .into_iter()
 .kmerge_by(|left, right| left < right)
 // TODO: Remove allocation
-.group_by(|el| (el.field_name.to_string(), el.typ))
+.chunk_by(|el| (el.field_name.to_string(), el.typ))
 {
 let mut merged: FieldMetadata = group.next().unwrap();
 for el in group {
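This and several later hunks replace itertools' `group_by` with `chunk_by`; the method was renamed in newer itertools releases and the behavior is unchanged: consecutive elements sharing a key end up in the same chunk, which is why the callers sort first. A hedged usage sketch, assuming itertools 0.13 or later:

```rust
use itertools::Itertools;

fn main() {
    let field_ids = [1u32, 1, 2, 2, 2, 1];
    // Only *consecutive* equal keys are grouped together.
    for (key, chunk) in &field_ids.iter().chunk_by(|&&id| id) {
        println!("field {key}: {} values", chunk.count());
    }
}
```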
@@ -187,7 +187,6 @@ impl DeleteCursor {
 }
 }

-#[allow(clippy::wrong_self_convention)]
 fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
 self.get()
 .map(|operation| operation.opstamp < target_opstamp)

@@ -21,7 +21,7 @@ pub enum DocToOpstampMapping<'a> {
 None,
 }

-impl<'a> DocToOpstampMapping<'a> {
+impl DocToOpstampMapping<'_> {
 /// Assess whether a document should be considered deleted given that it contains
 /// a deleted term that was deleted at the opstamp: `delete_opstamp`.
 ///

@@ -104,7 +104,7 @@ impl MergePolicy for LogMergePolicy {

 let mut current_max_log_size = f64::MAX;
 let mut levels = vec![];
-for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
+for (_, merge_group) in &size_sorted_segments.into_iter().chunk_by(|segment| {
 let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
 if segment_log_size < (current_max_log_size - self.level_log_size) {
 // update current_max_log_size to create a new group

@@ -36,7 +36,7 @@ impl MergePolicy for NoMergePolicy {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use super::*;

@@ -150,7 +150,7 @@ impl SegmentWriter {
 let vals_grouped_by_field = doc
 .iter_fields_and_values()
 .sorted_by_key(|(field, _)| *field)
-.group_by(|(field, _)| *field);
+.chunk_by(|(field, _)| *field);

 for (field, field_values) in &vals_grouped_by_field {
 let values = field_values.map(|el| el.1);
@@ -101,7 +101,7 @@ mod test {

 use super::Stamper;

-#[allow(clippy::redundant_clone)]
+#[expect(clippy::redundant_clone)]
 #[test]
 fn test_stamper() {
 let stamper = Stamper::new(7u64);

@@ -117,7 +117,7 @@ mod test {
 assert_eq!(stamper.stamp(), 15u64);
 }

-#[allow(clippy::redundant_clone)]
+#[expect(clippy::redundant_clone)]
 #[test]
 fn test_stamper_revert() {
 let stamper = Stamper::new(7u64);
src/lib.rs (11 changes)

@@ -178,10 +178,8 @@ pub use crate::future_result::FutureResult;
 pub type Result<T> = std::result::Result<T, TantivyError>;

 mod core;
-#[allow(deprecated)] // Remove with index sorting
 pub mod indexer;

-#[allow(unused_doc_comments)]
 pub mod error;
 pub mod tokenizer;

@@ -190,7 +188,6 @@ pub mod collector;
 pub mod directory;
 pub mod fastfield;
 pub mod fieldnorm;
-#[allow(deprecated)] // Remove with index sorting
 pub mod index;
 pub mod positions;
 pub mod postings;

@@ -223,7 +220,6 @@ pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
 pub use crate::core::json_utils;
 pub use crate::core::{Executor, Searcher, SearcherGeneration};
 pub use crate::directory::Directory;
-#[allow(deprecated)] // Remove with index sorting
 pub use crate::index::{
 Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
 SegmentMeta, SegmentReader,

@@ -232,7 +228,7 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
 pub use crate::schema::{Document, TantivyDocument, Term};

 /// Index format version.
-pub const INDEX_FORMAT_VERSION: u32 = 6;
+pub const INDEX_FORMAT_VERSION: u32 = 7;
 /// Oldest index format version this tantivy version can read.
 pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;

@@ -371,6 +367,7 @@ macro_rules! fail_point {
 }};
 }

+/// Common test utilities.
 #[cfg(test)]
 pub mod tests {
 use common::{BinarySerializable, FixedSize};

@@ -389,6 +386,7 @@ pub mod tests {
 use crate::schema::*;
 use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};

+/// Asserts that the serialized value is the value in the trait.
 pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
 let mut buffer = Vec::new();
 O::default().serialize(&mut buffer).unwrap();

@@ -421,6 +419,7 @@ pub mod tests {
 }};
 }

+/// Generates random numbers
 pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
 let seed: [u8; 32] = [1; 32];
 StdRng::from_seed(seed)

@@ -429,6 +428,7 @@ pub mod tests {
 .collect::<Vec<u32>>()
 }

+/// Sample `n` elements with Bernoulli distribution.
 pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
 StdRng::from_seed([seed_val; 32])
 .sample_iter(&Bernoulli::new(ratio).unwrap())

@@ -438,6 +438,7 @@ pub mod tests {
 .collect()
 }

+/// Sample `n` elements with Bernoulli distribution.
 pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
 sample_with_seed(n, ratio, 4)
 }
@@ -41,7 +41,6 @@
 /// );
 /// # }
 /// ```

 #[macro_export]
 macro_rules! doc(
 () => {

@@ -1,4 +1,5 @@
 //! Tantivy can (if instructed to do so in the schema) store the term positions in a given field.
+//!
 //! This position is expressed as token ordinal. For instance,
 //! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
 //! This information is useful to run phrase queries.

@@ -38,7 +39,7 @@ pub use self::serializer::PositionSerializer;
 const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use std::iter;

@@ -264,7 +264,7 @@ impl VIntDecoder for BlockDecoder {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use super::*;
 use crate::TERMINATED;
src/postings/loaded_postings.rs (new file, 155 lines)

@@ -0,0 +1,155 @@
+use crate::docset::{DocSet, TERMINATED};
+use crate::postings::{Postings, SegmentPostings};
+use crate::DocId;
+
+/// `LoadedPostings` is a `DocSet` and `Postings` implementation.
+/// It is used to represent the postings of a term in memory.
+/// It is suitable if there are few documents for a term.
+///
+/// It exists mainly to reduce memory usage.
+/// `SegmentPostings` uses 1840 bytes per instance due to its caches.
+/// If you need to keep many terms around with few docs, it's cheaper to load all the
+/// postings in memory.
+///
+/// This is relevant for `RegexPhraseQuery`, which may have a lot of
+/// terms.
+/// E.g. 100_000 terms would need 184MB due to SegmentPostings.
+pub struct LoadedPostings {
+doc_ids: Box<[DocId]>,
+position_offsets: Box<[u32]>,
+positions: Box<[u32]>,
+cursor: usize,
+}
+
+impl LoadedPostings {
+/// Creates a new `LoadedPostings` from a `SegmentPostings`.
+///
+/// It will also preload positions, if positions are available in the SegmentPostings.
+pub fn load(segment_postings: &mut SegmentPostings) -> LoadedPostings {
+let num_docs = segment_postings.doc_freq() as usize;
+let mut doc_ids = Vec::with_capacity(num_docs);
+let mut positions = Vec::with_capacity(num_docs);
+let mut position_offsets = Vec::with_capacity(num_docs);
+while segment_postings.doc() != TERMINATED {
+position_offsets.push(positions.len() as u32);
+doc_ids.push(segment_postings.doc());
+segment_postings.append_positions_with_offset(0, &mut positions);
+segment_postings.advance();
+}
+position_offsets.push(positions.len() as u32);
+LoadedPostings {
+doc_ids: doc_ids.into_boxed_slice(),
+positions: positions.into_boxed_slice(),
+position_offsets: position_offsets.into_boxed_slice(),
+cursor: 0,
+}
+}
+}
+
+#[cfg(test)]
+impl From<(Vec<DocId>, Vec<Vec<u32>>)> for LoadedPostings {
+fn from(doc_ids_and_positions: (Vec<DocId>, Vec<Vec<u32>>)) -> LoadedPostings {
+let mut position_offsets = Vec::new();
+let mut all_positions = Vec::new();
+let (doc_ids, docid_positions) = doc_ids_and_positions;
+for positions in docid_positions {
+position_offsets.push(all_positions.len() as u32);
+all_positions.extend_from_slice(&positions);
+}
+position_offsets.push(all_positions.len() as u32);
+LoadedPostings {
+doc_ids: doc_ids.into_boxed_slice(),
+positions: all_positions.into_boxed_slice(),
+position_offsets: position_offsets.into_boxed_slice(),
+cursor: 0,
+}
+}
+}
+
+impl DocSet for LoadedPostings {
+fn advance(&mut self) -> DocId {
+self.cursor += 1;
+if self.cursor >= self.doc_ids.len() {
+self.cursor = self.doc_ids.len();
+return TERMINATED;
+}
+self.doc()
+}
+
+fn doc(&self) -> DocId {
+if self.cursor >= self.doc_ids.len() {
+return TERMINATED;
+}
+self.doc_ids[self.cursor]
+}
+
+fn size_hint(&self) -> u32 {
+self.doc_ids.len() as u32
+}
+}
+impl Postings for LoadedPostings {
+fn term_freq(&self) -> u32 {
+let start = self.position_offsets[self.cursor] as usize;
+let end = self.position_offsets[self.cursor + 1] as usize;
+(end - start) as u32
+}
+
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+let start = self.position_offsets[self.cursor] as usize;
+let end = self.position_offsets[self.cursor + 1] as usize;
+for pos in &self.positions[start..end] {
+output.push(*pos + offset);
+}
+}
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+
+use super::*;
+
+#[test]
+pub fn test_vec_postings() {
+let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
+let mut postings = LoadedPostings::from((doc_ids, vec![]));
+assert_eq!(postings.doc(), 0u32);
+assert_eq!(postings.advance(), 3u32);
+assert_eq!(postings.doc(), 3u32);
+assert_eq!(postings.seek(14u32), 15u32);
+assert_eq!(postings.doc(), 15u32);
+assert_eq!(postings.seek(300u32), 300u32);
+assert_eq!(postings.doc(), 300u32);
+assert_eq!(postings.seek(6000u32), TERMINATED);
+}
+
+#[test]
+pub fn test_vec_postings2() {
+let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
+let mut positions = Vec::new();
+positions.resize(1024, Vec::new());
+positions[0] = vec![1u32, 2u32, 3u32];
+positions[1] = vec![30u32];
+positions[2] = vec![10u32];
+positions[4] = vec![50u32];
+let mut postings = LoadedPostings::from((doc_ids, positions));
+
+let load = |postings: &mut LoadedPostings| {
+let mut loaded_positions = Vec::new();
+postings.positions(loaded_positions.as_mut());
+loaded_positions
+};
+assert_eq!(postings.doc(), 0u32);
+assert_eq!(load(&mut postings), vec![1u32, 2u32, 3u32]);
+
+assert_eq!(postings.advance(), 3u32);
+assert_eq!(postings.doc(), 3u32);
+
+assert_eq!(load(&mut postings), vec![30u32]);
+
+assert_eq!(postings.seek(14u32), 15u32);
+assert_eq!(postings.doc(), 15u32);
+assert_eq!(postings.seek(300u32), 300u32);
+assert_eq!(postings.doc(), 300u32);
+assert_eq!(postings.seek(6000u32), TERMINATED);
+}
+}
@@ -8,6 +8,7 @@ mod block_segment_postings;
 pub(crate) mod compression;
 mod indexing_context;
 mod json_postings_writer;
+mod loaded_postings;
 mod per_field_postings_writer;
 mod postings;
 mod postings_writer;

@@ -17,6 +18,7 @@ mod serializer;
 mod skip;
 mod term_info;

+pub(crate) use loaded_postings::LoadedPostings;
 pub(crate) use stacker::compute_table_memory_size;

 pub use self::block_segment_postings::BlockSegmentPostings;

@@ -29,7 +31,7 @@ pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
 pub(crate) use self::skip::{BlockInfo, SkipReader};
 pub use self::term_info::TermInfo;

-#[allow(clippy::enum_variant_names)]
+#[expect(clippy::enum_variant_names)]
 #[derive(Debug, PartialEq, Clone, Copy, Eq)]
 pub(crate) enum FreqReadingOption {
 NoFreq,

@@ -38,7 +40,7 @@ pub(crate) enum FreqReadingOption {
 }

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {
 use std::mem;

 use super::{InvertedIndexSerializer, Postings};
@@ -17,7 +17,14 @@ pub trait Postings: DocSet + 'static {
 /// Returns the positions offsetted with a given value.
 /// It is not necessary to clear the `output` before calling this method.
 /// The output vector will be resized to the `term_freq`.
-fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
+fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+output.clear();
+self.append_positions_with_offset(offset, output);
+}
+
+/// Returns the positions offsetted with a given value.
+/// Data will be appended to the output.
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);

 /// Returns the positions of the term in the given document.
 /// The output vector will be resized to the `term_freq`.

@@ -25,3 +32,13 @@ pub trait Postings: DocSet + 'static {
 self.positions_with_offset(0u32, output);
 }
 }
+
+impl Postings for Box<dyn Postings> {
+fn term_freq(&self) -> u32 {
+(**self).term_freq()
+}
+
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+(**self).append_positions_with_offset(offset, output);
+}
+}
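The trait change above keeps the old overwrite-style `positions_with_offset` as a default method that clears the buffer and delegates to the new appending method, so existing callers keep working while implementors only provide the appending variant. A minimal sketch of that pattern with hypothetical names:

```rust
trait PositionSource {
    // Implementors provide only the appending primitive...
    fn append_positions(&mut self, offset: u32, output: &mut Vec<u32>);

    // ...and the old "replace the buffer" entry point stays as a default method.
    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
        output.clear();
        self.append_positions(offset, output);
    }
}

struct Fixed(Vec<u32>);

impl PositionSource for Fixed {
    fn append_positions(&mut self, offset: u32, output: &mut Vec<u32>) {
        output.extend(self.0.iter().map(|p| p + offset));
    }
}

fn main() {
    let mut src = Fixed(vec![1, 4, 9]);
    let mut buf = vec![42]; // stale content is cleared by the default method
    src.positions_with_offset(10, &mut buf);
    assert_eq!(buf, vec![11, 14, 19]);
}
```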
@@ -34,7 +34,7 @@ impl<'a> VInt32Reader<'a> {
 }
 }

-impl<'a> Iterator for VInt32Reader<'a> {
+impl Iterator for VInt32Reader<'_> {
 type Item = u32;

 fn next(&mut self) -> Option<u32> {

@@ -237,8 +237,9 @@ impl Postings for SegmentPostings {
 self.block_cursor.freq(self.cur)
 }

-fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
+fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
 let term_freq = self.term_freq();
+let prev_len = output.len();
 if let Some(position_reader) = self.position_reader.as_mut() {
 debug_assert!(
 !self.block_cursor.freqs().is_empty(),

@@ -249,15 +250,14 @@ impl Postings for SegmentPostings {
 .iter()
 .cloned()
 .sum::<u32>() as u64);
-output.resize(term_freq as usize, 0u32);
-position_reader.read(read_offset, &mut output[..]);
+// TODO: instead of zeroing the output, we could use MaybeUninit or similar.
+output.resize(prev_len + term_freq as usize, 0u32);
+position_reader.read(read_offset, &mut output[prev_len..]);
 let mut cum = offset;
-for output_mut in output.iter_mut() {
+for output_mut in output[prev_len..].iter_mut() {
 cum += *output_mut;
 *output_mut = cum;
 }
 } else {
 output.clear();
 }
 }
 }
@@ -6,6 +6,7 @@ use tantivy_fst::Automaton;

 use super::phrase_prefix_query::prefix_end;
 use crate::index::SegmentReader;
+use crate::postings::TermInfo;
 use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
 use crate::schema::{Field, IndexRecordOption};
 use crate::termdict::{TermDictionary, TermStreamer};

@@ -64,6 +65,18 @@ where

 term_stream_builder.into_stream()
 }
+
+/// Returns the term infos that match the automaton
+pub fn get_match_term_infos(&self, reader: &SegmentReader) -> crate::Result<Vec<TermInfo>> {
+let inverted_index = reader.inverted_index(self.field)?;
+let term_dict = inverted_index.terms();
+let mut term_stream = self.automaton_stream(term_dict)?;
+let mut term_infos = Vec::new();
+while term_stream.advance() {
+term_infos.push(term_stream.value().clone());
+}
+Ok(term_infos)
+}
 }

 impl<A> Weight for AutomatonWeight<A>
@@ -272,7 +272,7 @@ impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
 }
 }

-impl<'a> Deref for TermScorerWithMaxScore<'a> {
+impl Deref for TermScorerWithMaxScore<'_> {
 type Target = TermScorer;

 fn deref(&self) -> &Self::Target {

@@ -280,7 +280,7 @@ impl<'a> Deref for TermScorerWithMaxScore<'a> {
 }
 }

-impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
+impl DerefMut for TermScorerWithMaxScore<'_> {
 fn deref_mut(&mut self) -> &mut Self::Target {
 self.scorer
 }

@@ -308,7 +308,7 @@ mod tests {

 use crate::query::score_combiner::SumCombiner;
 use crate::query::term_query::TermScorer;
-use crate::query::{Bm25Weight, Scorer, Union};
+use crate::query::{Bm25Weight, BufferedUnionScorer, Scorer};
 use crate::{DocId, DocSet, Score, TERMINATED};

 struct Float(Score);

@@ -371,7 +371,7 @@ mod tests {
 fn compute_checkpoints_manual(term_scorers: Vec<TermScorer>, n: usize) -> Vec<(DocId, Score)> {
 let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
 let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
-let mut scorer = Union::build(term_scorers, SumCombiner::default);
+let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default);

 let mut limit = Score::MIN;
 loop {

@@ -417,7 +417,7 @@ mod tests {
 .boxed()
 }

-#[allow(clippy::type_complexity)]
+#[expect(clippy::type_complexity)]
 fn gen_term_scorers(num_scorers: usize) -> BoxedStrategy<(Vec<Vec<(DocId, u32)>>, Vec<u32>)> {
 (1u32..100u32)
 .prop_flat_map(move |max_doc: u32| {
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::{
|
||||
intersect_scorers, EmptyScorer, Exclude, Explanation, Occur, RequiredOptionalScorer, Scorer,
|
||||
Union, Weight,
|
||||
intersect_scorers, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
|
||||
RequiredOptionalScorer, Scorer, Weight,
|
||||
};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
@@ -65,14 +65,17 @@ where
|
||||
// Block wand is only available if we read frequencies.
|
||||
return SpecializedScorer::TermUnion(scorers);
|
||||
} else {
|
||||
return SpecializedScorer::Other(Box::new(Union::build(
|
||||
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
score_combiner_fn,
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
SpecializedScorer::Other(Box::new(Union::build(scorers, score_combiner_fn)))
|
||||
SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
|
||||
scorers,
|
||||
score_combiner_fn,
|
||||
)))
|
||||
}
|
||||
|
||||
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
@@ -81,7 +84,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
) -> Box<dyn Scorer> {
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let union_scorer = Union::build(term_scorers, score_combiner_fn);
|
||||
let union_scorer = BufferedUnionScorer::build(term_scorers, score_combiner_fn);
|
||||
Box::new(union_scorer)
|
||||
}
|
||||
SpecializedScorer::Other(scorer) => scorer,
|
||||
@@ -296,7 +299,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
|
||||
for_each_scorer(&mut union_scorer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
@@ -316,7 +320,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
|
||||
match scorer {
|
||||
SpecializedScorer::TermUnion(term_scorers) => {
|
||||
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
|
||||
let mut union_scorer =
|
||||
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
|
||||
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
|
||||
@@ -51,6 +51,7 @@ pub use self::fuzzy_query::FuzzyTermQuery;
 pub use self::intersection::{intersect_scorers, Intersection};
 pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
 pub use self::phrase_prefix_query::PhrasePrefixQuery;
+pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery};
 pub use self::phrase_query::PhraseQuery;
 pub use self::query::{EnableScoring, Query, QueryClone};
 pub use self::query_parser::{QueryParser, QueryParserError};

@@ -61,7 +62,7 @@ pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombine
 pub use self::scorer::Scorer;
 pub use self::set_query::TermSetQuery;
 pub use self::term_query::TermQuery;
-pub use self::union::Union;
+pub use self::union::BufferedUnionScorer;
 #[cfg(test)]
 pub use self::vec_docset::VecDocSet;
 pub use self::weight::Weight;

@@ -8,7 +8,7 @@ use crate::{DocId, Score};

 // MultiPrefix is the larger variant, and also the one we expect most often. PhraseScorer is > 1kB
 // though, it would be interesting to slim it down if possible.
-#[allow(clippy::large_enum_variant)]
+#[expect(clippy::large_enum_variant)]
 enum PhraseKind<TPostings: Postings> {
 SinglePrefix {
 position_offset: u32,
@@ -53,27 +53,14 @@ impl PhrasePrefixWeight {
 .map(|similarity_weight| similarity_weight.boost_by(boost));
 let fieldnorm_reader = self.fieldnorm_reader(reader)?;
 let mut term_postings_list = Vec::new();
 if reader.has_deletes() {
 for &(offset, ref term) in &self.phrase_terms {
 if let Some(postings) = reader
 .inverted_index(term.field())?
 .read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
 {
 term_postings_list.push((offset, postings));
 } else {
 return Ok(None);
 }
 }
 } else {
 for &(offset, ref term) in &self.phrase_terms {
 if let Some(postings) = reader
 .inverted_index(term.field())?
 .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
 {
 term_postings_list.push((offset, postings));
 } else {
 return Ok(None);
 }
 for &(offset, ref term) in &self.phrase_terms {
 if let Some(postings) = reader
 .inverted_index(term.field())?
 .read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
 {
 term_postings_list.push((offset, postings));
 } else {
 return Ok(None);
 }
 }

@@ -109,8 +96,8 @@ impl PhrasePrefixWeight {
 {
 suffixes.push(postings);
 }
-} else if let Some(postings) = inv_index
-.read_postings_no_deletes(&new_term, IndexRecordOption::WithFreqsAndPositions)?
+} else if let Some(postings) =
+inv_index.read_postings(&new_term, IndexRecordOption::WithFreqsAndPositions)?
 {
 suffixes.push(postings);
 }
@@ -1,6 +1,8 @@
 mod phrase_query;
 mod phrase_scorer;
 mod phrase_weight;
+pub mod regex_phrase_query;
+mod regex_phrase_weight;

 pub use self::phrase_query::PhraseQuery;
 pub(crate) use self::phrase_scorer::intersection_count;

@@ -8,7 +10,7 @@ pub use self::phrase_scorer::PhraseScorer;
 pub use self::phrase_weight::PhraseWeight;

 #[cfg(test)]
-pub mod tests {
+pub(crate) mod tests {

 use serde_json::json;
@@ -19,15 +21,15 @@ pub mod tests {
 use crate::schema::{Schema, Term, TEXT};
 use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};

-pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
+pub fn create_index<S: AsRef<str>>(texts: &[S]) -> crate::Result<Index> {
 let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 {
 let mut index_writer: IndexWriter = index.writer_for_tests()?;
-for &text in texts {
-let doc = doc!(text_field=>text);
+for text in texts {
+let doc = doc!(text_field=>text.as_ref());
 index_writer.add_document(doc)?;
 }
 index_writer.commit()?;
@@ -50,27 +50,14 @@ impl PhraseWeight {
|
||||
.map(|similarity_weight| similarity_weight.boost_by(boost));
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
|
||||
let mut term_postings_list = Vec::new();
|
||||
if reader.has_deletes() {
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())?
|
||||
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())?
|
||||
.read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())?
|
||||
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
Ok(Some(PhraseScorer::new(
172
src/query/phrase_query/regex_phrase_query.rs
Normal file
@@ -0,0 +1,172 @@
|
||||
use super::regex_phrase_weight::RegexPhraseWeight;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::{EnableScoring, Query, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption, Term, Type};
|
||||
|
||||
/// `RegexPhraseQuery` matches a specific sequence of regex queries.
|
||||
///
|
||||
/// For instance, the phrase query for `"pa.* time"` will match
|
||||
/// the sentence:
|
||||
///
|
||||
/// **Alan just got a part time job.**
|
||||
///
|
||||
/// On the other hand, it will not match the sentence:
|
||||
///
|
||||
/// **This is my favorite part of the job.**
|
||||
///
|
||||
/// [Slop](RegexPhraseQuery::set_slop) allows leniency in term proximity
|
||||
/// for some performance trade-off.
|
||||
///
|
||||
/// Using a `RegexPhraseQuery` on a field requires positions
|
||||
/// to be indexed for this field.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RegexPhraseQuery {
|
||||
field: Field,
|
||||
phrase_terms: Vec<(usize, String)>,
|
||||
slop: u32,
|
||||
max_expansions: u32,
|
||||
}
|
||||
|
||||
/// Transform a wildcard query to a regex string.
|
||||
///
|
||||
/// `AB*CD` for example is converted to `AB.*CD`
|
||||
///
|
||||
/// All other chars are regex escaped.
|
||||
pub fn wildcard_query_to_regex_str(term: &str) -> String {
|
||||
regex::escape(term).replace(r"\*", ".*")
|
||||
}
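// Illustrative sketch (not part of the original change): a minimal check of the conversion done
// by `wildcard_query_to_regex_str` above, derived directly from its body
// (`regex::escape` followed by rewriting the escaped `\*` to `.*`).
#[cfg(test)]
mod wildcard_conversion_sketch {
    use super::wildcard_query_to_regex_str;

    #[test]
    fn wildcards_become_dot_star_and_the_rest_is_escaped() {
        assert_eq!(wildcard_query_to_regex_str("AB*CD"), "AB.*CD");
        // `+` is regex-escaped; only `*` is rewritten to `.*`.
        assert_eq!(wildcard_query_to_regex_str("a+b*"), r"a\+b.*");
    }
}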
|
||||
|
||||
impl RegexPhraseQuery {
|
||||
/// Creates a new `RegexPhraseQuery` given a list of terms.
|
||||
///
|
||||
/// There must be at least two terms, and all terms
|
||||
/// must belong to the same field.
|
||||
///
|
||||
/// The offset for each term will be the same as its index in the vector.
|
||||
pub fn new(field: Field, terms: Vec<String>) -> RegexPhraseQuery {
|
||||
let terms_with_offset = terms.into_iter().enumerate().collect();
|
||||
RegexPhraseQuery::new_with_offset(field, terms_with_offset)
|
||||
}
|
||||
|
||||
/// Creates a new `RegexPhraseQuery` given a list of terms and their offsets.
|
||||
///
|
||||
/// Can be used to provide custom offset for each term.
|
||||
pub fn new_with_offset(field: Field, terms: Vec<(usize, String)>) -> RegexPhraseQuery {
|
||||
RegexPhraseQuery::new_with_offset_and_slop(field, terms, 0)
|
||||
}
|
||||
|
||||
/// Creates a new `RegexPhraseQuery` given a list of terms, their offsets and a slop
|
||||
pub fn new_with_offset_and_slop(
|
||||
field: Field,
|
||||
mut terms: Vec<(usize, String)>,
|
||||
slop: u32,
|
||||
) -> RegexPhraseQuery {
|
||||
assert!(
|
||||
terms.len() > 1,
|
||||
"A phrase query is required to have strictly more than one term."
|
||||
);
|
||||
terms.sort_by_key(|&(offset, _)| offset);
|
||||
RegexPhraseQuery {
|
||||
field,
|
||||
phrase_terms: terms,
|
||||
slop,
|
||||
max_expansions: 1 << 14,
|
||||
}
|
||||
}
|
||||
|
||||
/// Slop allowed for the phrase.
|
||||
///
|
||||
/// The query will match if its terms are separated by `slop` terms at most.
|
||||
/// The slop can be considered a budget between all terms.
|
||||
/// E.g. "A B C" with slop 1 allows "A X B C", "A B X C", but not "A X B X C".
|
||||
///
|
||||
/// Transposition costs 2, e.g. "A B" with slop 1 will not match "B A", but it would with slop 2.
|
||||
/// Transposition is not a special case, in the example above A is moved 1 position and B is
|
||||
/// moved 1 position, so the slop is 2.
|
||||
///
|
||||
/// As a result slop works in both directions, so the order of the terms may change as long as
|
||||
/// they respect the slop.
|
||||
///
|
||||
/// By default the slop is 0 meaning query terms need to be adjacent.
|
||||
pub fn set_slop(&mut self, value: u32) {
|
||||
self.slop = value;
|
||||
}
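// Illustrative sketch (not part of the original change) of the slop budget described above,
// reusing the `create_index` test helper (TEXT field named "text") and plain literals as the
// regexes:
//
//     let index = create_index(&["a x c", "a c"])?;
//     let mut query = RegexPhraseQuery::new(text_field, vec!["a".into(), "c".into()]);
//     // With the default slop of 0, only "a c" matches (the terms are adjacent).
//     query.set_slop(1);
//     // With slop 1, "a x c" matches as well: one extra term may sit between "a" and "c".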
|
||||
|
||||
/// Sets the maximum number of terms a regex may expand to. The limit applies across all terms of the phrase.
|
||||
/// After the limit is hit an error will be returned.
|
||||
pub fn set_max_expansions(&mut self, value: u32) {
|
||||
self.max_expansions = value;
|
||||
}
|
||||
|
||||
/// The [`Field`] this `RegexPhraseQuery` is targeting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// `Term`s in the phrase without the associated offsets.
|
||||
pub fn phrase_terms(&self) -> Vec<Term> {
|
||||
self.phrase_terms
|
||||
.iter()
|
||||
.map(|(_, term)| Term::from_field_text(self.field, term))
|
||||
.collect::<Vec<Term>>()
|
||||
}
|
||||
|
||||
/// Returns the [`RegexPhraseWeight`] for the given phrase query given a specific `searcher`.
|
||||
///
|
||||
/// This function is the same as [`Query::weight()`] except it returns
|
||||
/// a specialized type [`RegexPhraseWeight`] instead of a Boxed trait.
|
||||
pub(crate) fn regex_phrase_weight(
|
||||
&self,
|
||||
enable_scoring: EnableScoring<'_>,
|
||||
) -> crate::Result<RegexPhraseWeight> {
|
||||
let schema = enable_scoring.schema();
|
||||
let field_type = schema.get_field_entry(self.field).field_type().value_type();
|
||||
if field_type != Type::Str {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"RegexPhraseQuery can only be used with a field of type text currently, but got \
|
||||
{:?}",
|
||||
field_type
|
||||
)));
|
||||
}
|
||||
|
||||
let field_entry = schema.get_field_entry(self.field);
|
||||
let has_positions = field_entry
|
||||
.field_type()
|
||||
.get_index_record_option()
|
||||
.map(IndexRecordOption::has_positions)
|
||||
.unwrap_or(false);
|
||||
if !has_positions {
|
||||
let field_name = field_entry.name();
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Applied phrase query on field {field_name:?}, which does not have positions \
|
||||
indexed"
|
||||
)));
|
||||
}
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight_opt = match enable_scoring {
|
||||
EnableScoring::Enabled {
|
||||
statistics_provider,
|
||||
..
|
||||
} => Some(Bm25Weight::for_terms(statistics_provider, &terms)?),
|
||||
EnableScoring::Disabled { .. } => None,
|
||||
};
|
||||
let weight = RegexPhraseWeight::new(
|
||||
self.field,
|
||||
self.phrase_terms.clone(),
|
||||
bm25_weight_opt,
|
||||
self.max_expansions,
|
||||
self.slop,
|
||||
);
|
||||
Ok(weight)
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RegexPhraseQuery {
|
||||
/// Create the weight associated with a query.
|
||||
///
|
||||
/// See [`Weight`].
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
let phrase_weight = self.regex_phrase_weight(enable_scoring)?;
|
||||
Ok(Box::new(phrase_weight))
|
||||
}
|
||||
}
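// Illustrative usage sketch (not part of the original change): drives a `RegexPhraseQuery` end
// to end through `Searcher::search`, reusing the `create_index` helper from the phrase_query
// test module (which indexes a single TEXT field named "text").
#[cfg(test)]
mod usage_sketch {
    use super::super::tests::create_index;
    use super::RegexPhraseQuery;
    use crate::collector::Count;

    #[test]
    fn count_matches_for_a_regex_phrase() -> crate::Result<()> {
        let index = create_index(&[
            "alan just got a part time job",
            "this is my favorite part of the job",
        ])?;
        let text_field = index.schema().get_field("text").unwrap();
        let searcher = index.reader()?.searcher();

        let query = RegexPhraseQuery::new(text_field, vec!["pa.*".into(), "time".into()]);
        // Only the first document contains a term matching "pa.*" directly followed by "time".
        assert_eq!(searcher.search(&query, &Count)?, 1);
        Ok(())
    }
}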
475
src/query/phrase_query/regex_phrase_weight.rs
Normal file
@@ -0,0 +1,475 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::BitSet;
|
||||
use tantivy_fst::Regex;
|
||||
|
||||
use super::PhraseScorer;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::index::SegmentReader;
|
||||
use crate::postings::{LoadedPostings, Postings, SegmentPostings, TermInfo};
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::union::{BitSetPostingUnion, SimpleUnion};
|
||||
use crate::query::{AutomatonWeight, BitSetDocSet, EmptyScorer, Explanation, Scorer, Weight};
|
||||
use crate::schema::{Field, IndexRecordOption};
|
||||
use crate::{DocId, DocSet, InvertedIndexReader, Score};
|
||||
|
||||
type UnionType = SimpleUnion<Box<dyn Postings + 'static>>;
|
||||
|
||||
/// The `RegexPhraseWeight` is the weight associated with a regex phrase query.
|
||||
/// See `RegexPhraseWeight::get_union_from_term_infos` for some design decisions.
|
||||
pub struct RegexPhraseWeight {
|
||||
field: Field,
|
||||
phrase_terms: Vec<(usize, String)>,
|
||||
similarity_weight_opt: Option<Bm25Weight>,
|
||||
slop: u32,
|
||||
max_expansions: u32,
|
||||
}
|
||||
|
||||
impl RegexPhraseWeight {
|
||||
/// Creates a new phrase weight.
|
||||
/// If `similarity_weight_opt` is None, then scoring is disabled
|
||||
pub fn new(
|
||||
field: Field,
|
||||
phrase_terms: Vec<(usize, String)>,
|
||||
similarity_weight_opt: Option<Bm25Weight>,
|
||||
max_expansions: u32,
|
||||
slop: u32,
|
||||
) -> RegexPhraseWeight {
|
||||
RegexPhraseWeight {
|
||||
field,
|
||||
phrase_terms,
|
||||
similarity_weight_opt,
|
||||
slop,
|
||||
max_expansions,
|
||||
}
|
||||
}
|
||||
|
||||
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
|
||||
if self.similarity_weight_opt.is_some() {
|
||||
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(self.field)? {
|
||||
return Ok(fieldnorm_reader);
|
||||
}
|
||||
}
|
||||
Ok(FieldNormReader::constant(reader.max_doc(), 1))
|
||||
}
|
||||
|
||||
pub(crate) fn phrase_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
boost: Score,
|
||||
) -> crate::Result<Option<PhraseScorer<UnionType>>> {
|
||||
let similarity_weight_opt = self
|
||||
.similarity_weight_opt
|
||||
.as_ref()
|
||||
.map(|similarity_weight| similarity_weight.boost_by(boost));
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
|
||||
let mut posting_lists = Vec::new();
|
||||
let inverted_index = reader.inverted_index(self.field)?;
|
||||
let mut num_terms = 0;
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
let regex = Regex::new(term)
|
||||
.map_err(|e| crate::TantivyError::InvalidArgument(format!("Invalid regex: {e}")))?;
|
||||
|
||||
let automaton: AutomatonWeight<Regex> =
|
||||
AutomatonWeight::new(self.field, Arc::new(regex));
|
||||
let term_infos = automaton.get_match_term_infos(reader)?;
|
||||
// If term_infos is empty, the phrase can not match any documents.
|
||||
if term_infos.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
num_terms += term_infos.len();
|
||||
if num_terms > self.max_expansions as usize {
|
||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||
"Phrase query exceeded max expansions {}",
|
||||
num_terms
|
||||
)));
|
||||
}
|
||||
let union = Self::get_union_from_term_infos(&term_infos, reader, &inverted_index)?;
|
||||
|
||||
posting_lists.push((offset, union));
|
||||
}
|
||||
|
||||
Ok(Some(PhraseScorer::new(
|
||||
posting_lists,
|
||||
similarity_weight_opt,
|
||||
fieldnorm_reader,
|
||||
self.slop,
|
||||
)))
|
||||
}
|
||||
|
||||
/// Add all docs of the term to the docset
|
||||
fn add_to_bitset(
|
||||
inverted_index: &InvertedIndexReader,
|
||||
term_info: &TermInfo,
|
||||
doc_bitset: &mut BitSet,
|
||||
) -> crate::Result<()> {
|
||||
let mut block_segment_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
|
||||
loop {
|
||||
let docs = block_segment_postings.docs();
|
||||
if docs.is_empty() {
|
||||
break;
|
||||
}
|
||||
for &doc in docs {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
block_segment_postings.advance();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This function generates a union of document sets from multiple term information
|
||||
/// (`TermInfo`).
|
||||
///
|
||||
/// It uses bucketing based on term frequency to optimize query performance and memory usage.
|
||||
/// The terms are divided into buckets based on their document frequency (the number of
|
||||
/// documents they appear in).
|
||||
///
|
||||
/// ### Bucketing Strategy:
|
||||
/// Once a bucket contains more than 512 terms, it is moved to the end of the list and replaced
|
||||
/// with a new empty bucket.
|
||||
///
|
||||
/// - **Sparse Term Buckets**: Terms with document frequency `< 100`.
|
||||
///
|
||||
/// Each sparse bucket contains:
|
||||
/// - A `BitSet` to efficiently track which document IDs are present in the bucket, which is
|
||||
/// used to drive the `DocSet`.
|
||||
/// - A `Vec<LoadedPostings>` to store the postings for each term in that bucket.
|
||||
///
|
||||
/// - **Other Term Buckets**:
|
||||
/// - **Bucket 0**: Terms appearing in less than `0.1%` of documents.
|
||||
/// - **Bucket 1**: Terms appearing in `0.1%` to `1%` of documents.
|
||||
/// - **Bucket 2**: Terms appearing in `1%` to `10%` of documents.
|
||||
/// - **Bucket 3**: Terms appearing in more than `10%` of documents.
|
||||
///
|
||||
/// Each bucket contains:
|
||||
/// - A `BitSet` to efficiently track which document IDs are present in the bucket.
|
||||
/// - A `Vec<SegmentPostings>` to store the postings for each term in that bucket.
|
||||
///
|
||||
/// ### Design Choices:
|
||||
/// The main cost for an _unbucketed_ regex phrase query with a medium/high number of terms is
|
||||
/// the `append_positions_with_offset` from `Postings`.
|
||||
/// We don't know which docsets hit, so we need to scan all of them to check if they contain the
|
||||
/// docid.
|
||||
/// The bucketing strategy groups less common DocSets together, so we can rule out the
|
||||
/// whole docset group in many cases.
|
||||
///
|
||||
/// E.g. consider the phrase "th* world"
|
||||
/// It contains the term "the", which may occur in almost all documents.
|
||||
/// It may also expand to 10_000s of very rare terms like "theologian".
|
||||
///
|
||||
/// For very low-frequency terms (sparse terms), we use `LoadedPostings` and aggregate
|
||||
/// their document IDs into a `BitSet`, which is more memory-efficient than using
|
||||
/// `SegmentPostings`. E.g. 100_000 terms with SegmentPostings would consume 184MB.
|
||||
/// `SegmentPostings` uses memory equivalent to 460 docids. The 100 docs limit should be
|
||||
/// fine as long as a term doesn't have too many positions per doc.
|
||||
///
|
||||
/// ### Future Optimization:
|
||||
/// A larger performance improvement would be an additional partitioning of the space
|
||||
/// vertically into blocks of u16::MAX docids, where we mark which docset ord has values in each block.
|
||||
/// E.g. partitioning an index with 5 million documents this way would reduce the number of
|
||||
/// docsets to scan to around 1/20 in the sparse term bucket where the terms only have a few
|
||||
/// docs. For higher cardinality buckets this is irrelevant as they are in most blocks.
|
||||
///
|
||||
/// Use Roaring Bitmaps for sparse terms: the full bitvec is currently the main memory consumer.
|
||||
pub(crate) fn get_union_from_term_infos(
|
||||
term_infos: &[TermInfo],
|
||||
reader: &SegmentReader,
|
||||
inverted_index: &InvertedIndexReader,
|
||||
) -> crate::Result<UnionType> {
|
||||
let max_doc = reader.max_doc();
|
||||
|
||||
// Buckets for sparse terms
|
||||
let mut sparse_buckets: Vec<(BitSet, Vec<LoadedPostings>)> =
|
||||
vec![(BitSet::with_max_value(max_doc), Vec::new())];
|
||||
|
||||
// Buckets for other terms based on document frequency percentages:
|
||||
// - Bucket 0: Terms appearing in less than 0.1% of documents
|
||||
// - Bucket 1: Terms appearing in 0.1% to 1% of documents
|
||||
// - Bucket 2: Terms appearing in 1% to 10% of documents
|
||||
// - Bucket 3: Terms appearing in more than 10% of documents
|
||||
let mut buckets: Vec<(BitSet, Vec<SegmentPostings>)> = (0..4)
|
||||
.map(|_| (BitSet::with_max_value(max_doc), Vec::new()))
|
||||
.collect();
|
||||
|
||||
const SPARSE_TERM_DOC_THRESHOLD: u32 = 100;
|
||||
|
||||
for term_info in term_infos {
|
||||
let mut term_posting = inverted_index
|
||||
.read_postings_from_terminfo(term_info, IndexRecordOption::WithFreqsAndPositions)?;
|
||||
let num_docs = term_posting.doc_freq();
|
||||
|
||||
if num_docs < SPARSE_TERM_DOC_THRESHOLD {
|
||||
let current_bucket = &mut sparse_buckets[0];
|
||||
Self::add_to_bitset(inverted_index, term_info, &mut current_bucket.0)?;
|
||||
let docset = LoadedPostings::load(&mut term_posting);
|
||||
current_bucket.1.push(docset);
|
||||
|
||||
// Move the bucket to the end if the term limit is reached
|
||||
if current_bucket.1.len() == 512 {
|
||||
sparse_buckets.push((BitSet::with_max_value(max_doc), Vec::new()));
|
||||
let end_index = sparse_buckets.len() - 1;
|
||||
sparse_buckets.swap(0, end_index);
|
||||
}
|
||||
} else {
|
||||
// Calculate the percentage of documents the term appears in
|
||||
let doc_freq_percentage = (num_docs as f32) / (max_doc as f32) * 100.0;
|
||||
|
||||
// Determine the appropriate bucket based on percentage thresholds
|
||||
let bucket_index = if doc_freq_percentage < 0.1 {
|
||||
0
|
||||
} else if doc_freq_percentage < 1.0 {
|
||||
1
|
||||
} else if doc_freq_percentage < 10.0 {
|
||||
2
|
||||
} else {
|
||||
3
|
||||
};
|
||||
let bucket = &mut buckets[bucket_index];
|
||||
|
||||
// Add term postings to the appropriate bucket
|
||||
Self::add_to_bitset(inverted_index, term_info, &mut bucket.0)?;
|
||||
bucket.1.push(term_posting);
|
||||
|
||||
// Move the bucket to the end if the term limit is reached
|
||||
if bucket.1.len() == 512 {
|
||||
buckets.push((BitSet::with_max_value(max_doc), Vec::new()));
|
||||
let end_index = buckets.len() - 1;
|
||||
buckets.swap(bucket_index, end_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build unions for sparse term buckets
|
||||
let sparse_term_docsets: Vec<_> = sparse_buckets
|
||||
.into_iter()
|
||||
.filter(|(_, postings)| !postings.is_empty())
|
||||
.map(|(bitset, postings)| {
|
||||
BitSetPostingUnion::build(postings, BitSetDocSet::from(bitset))
|
||||
})
|
||||
.collect();
|
||||
let sparse_term_unions = SimpleUnion::build(sparse_term_docsets);
|
||||
|
||||
// Build unions for other term buckets
|
||||
let bitset_unions_per_bucket: Vec<_> = buckets
|
||||
.into_iter()
|
||||
.filter(|(_, postings)| !postings.is_empty())
|
||||
.map(|(bitset, postings)| {
|
||||
BitSetPostingUnion::build(postings, BitSetDocSet::from(bitset))
|
||||
})
|
||||
.collect();
|
||||
let other_union = SimpleUnion::build(bitset_unions_per_bucket);
|
||||
|
||||
let union: SimpleUnion<Box<dyn Postings + 'static>> =
|
||||
SimpleUnion::build(vec![Box::new(sparse_term_unions), Box::new(other_union)]);
|
||||
|
||||
// Return a union of sparse term unions and other term unions
|
||||
Ok(union)
|
||||
}
|
||||
}
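// Illustrative sketch (not part of the original change): the bucket selection described in the
// docs of `get_union_from_term_infos`, written out as a standalone function. The thresholds
// (100 docs for the sparse bucket, then 0.1% / 1% / 10% of `max_doc`) are copied from the code
// above and this helper is not used by the implementation itself.
#[cfg(test)]
#[allow(dead_code)]
fn bucket_for_doc_freq_sketch(doc_freq: u32, max_doc: u32) -> &'static str {
    if doc_freq < 100 {
        return "sparse bucket (LoadedPostings + shared BitSet)";
    }
    let doc_freq_percentage = (doc_freq as f32) / (max_doc as f32) * 100.0;
    if doc_freq_percentage < 0.1 {
        "bucket 0: < 0.1% of documents"
    } else if doc_freq_percentage < 1.0 {
        "bucket 1: 0.1% to 1%"
    } else if doc_freq_percentage < 10.0 {
        "bucket 2: 1% to 10%"
    } else {
        "bucket 3: more than 10% of documents"
    }
}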
|
||||
|
||||
impl Weight for RegexPhraseWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
|
||||
Ok(Box::new(scorer))
|
||||
} else {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
|
||||
if scorer_opt.is_none() {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let mut scorer = scorer_opt.unwrap();
|
||||
if scorer.seek(doc) != doc {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
||||
let phrase_count = scorer.phrase_count();
|
||||
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
|
||||
if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
|
||||
explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
|
||||
}
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use proptest::prelude::*;
|
||||
use rand::seq::SliceRandom;
|
||||
|
||||
use super::super::tests::create_index;
|
||||
use crate::docset::TERMINATED;
|
||||
use crate::query::{wildcard_query_to_regex_str, EnableScoring, RegexPhraseQuery};
|
||||
use crate::DocSet;
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(50))]
|
||||
#[test]
|
||||
fn test_phrase_regex_with_random_strings(mut random_strings in proptest::collection::vec("[c-z ]{0,10}", 1..100), num_occurrences in 1..150_usize) {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// Insert "aaa ccc" the specified number of times into the list
|
||||
for _ in 0..num_occurrences {
|
||||
random_strings.push("aaa ccc".to_string());
|
||||
}
|
||||
// Shuffle the list, which now contains random strings and the inserted "aaa ccc"
|
||||
random_strings.shuffle(&mut rng);
|
||||
|
||||
// Compute the positions of "aaa ccc" after the shuffle
|
||||
let aaa_ccc_positions: Vec<usize> = random_strings
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, s)| if s == "aaa ccc" { Some(idx) } else { None })
|
||||
.collect();
|
||||
|
||||
// Create the index with random strings and the fixed string "aaa ccc"
|
||||
let index = create_index(&random_strings.iter().map(AsRef::as_ref).collect::<Vec<&str>>())?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec![wildcard_query_to_regex_str("a*"), wildcard_query_to_regex_str("c*")]);
|
||||
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
|
||||
// Check if the scorer returns the correct document positions for "aaa ccc"
|
||||
for expected_doc in aaa_ccc_positions {
|
||||
prop_assert_eq!(phrase_scorer.doc(), expected_doc as u32);
|
||||
prop_assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
phrase_scorer.advance();
|
||||
}
|
||||
prop_assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_count() -> crate::Result<()> {
|
||||
let index = create_index(&["a c", "a a b d a b c", " a b"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec!["a".into(), "b".into()]);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), 2);
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_wildcard() -> crate::Result<()> {
|
||||
let index = create_index(&["a c", "a aa b d ad b c", " ac b", "bac b"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec!["a.*".into(), "b".into()]);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), 2);
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_regex() -> crate::Result<()> {
|
||||
let index = create_index(&["ba b", "a aa b d ad b c", "bac b"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(text_field, vec!["b?a.*".into(), "b".into()]);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), 2);
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_regex_with_slop() -> crate::Result<()> {
|
||||
let index = create_index(&["aaa bbb ccc ___ abc ddd bbb ccc"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let mut phrase_query = RegexPhraseQuery::new(text_field, vec!["a.*".into(), "c.*".into()]);
|
||||
phrase_query.set_slop(1);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
phrase_query.set_slop(2);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_regex_double_wildcard() -> crate::Result<()> {
|
||||
let index = create_index(&["baaab bccccb"])?;
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let phrase_query = RegexPhraseQuery::new(
|
||||
text_field,
|
||||
vec![
|
||||
wildcard_query_to_regex_str("*a*"),
|
||||
wildcard_query_to_regex_str("*c*"),
|
||||
],
|
||||
);
|
||||
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
||||
.unwrap();
|
||||
assert_eq!(phrase_scorer.doc(), 0);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -402,7 +402,7 @@ fn search_on_u64_ff(
|
||||
boost: Score,
|
||||
bounds: BoundsRange<u64>,
|
||||
) -> crate::Result<Box<dyn Scorer>> {
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
#[expect(clippy::reversed_empty_ranges)]
|
||||
let value_range = bound_to_value_range(
|
||||
&bounds.lower_bound,
|
||||
&bounds.upper_bound,
|
||||
@@ -1386,7 +1386,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod ip_range_tests {
|
||||
pub(crate) mod ip_range_tests {
|
||||
use proptest::prelude::ProptestConfig;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
89
src/query/union/bitset_union.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use crate::docset::DocSet;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::BitSetDocSet;
|
||||
use crate::DocId;
|
||||
|
||||
/// A `Postings` implementation that uses the bitset for hits and the docsets for the posting lists.
|
||||
///
|
||||
/// It is used for the regex phrase query, where we need the union of a large number of
|
||||
/// terms, but need to keep the docsets for the postings.
|
||||
pub struct BitSetPostingUnion<TDocSet> {
|
||||
/// The docsets are required to load positions
|
||||
///
|
||||
/// RefCell because we mutate in term_freq
|
||||
docsets: RefCell<Vec<TDocSet>>,
|
||||
/// The already unionized BitSet of the docsets
|
||||
bitset: BitSetDocSet,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> BitSetPostingUnion<TDocSet> {
|
||||
pub(crate) fn build(
|
||||
docsets: Vec<TDocSet>,
|
||||
bitset: BitSetDocSet,
|
||||
) -> BitSetPostingUnion<TDocSet> {
|
||||
BitSetPostingUnion {
|
||||
docsets: RefCell::new(docsets),
|
||||
bitset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: Postings> Postings for BitSetPostingUnion<TDocSet> {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let curr_doc = self.bitset.doc();
|
||||
let mut term_freq = 0;
|
||||
let mut docsets = self.docsets.borrow_mut();
|
||||
for docset in docsets.iter_mut() {
|
||||
if docset.doc() < curr_doc {
|
||||
docset.seek(curr_doc);
|
||||
}
|
||||
if docset.doc() == curr_doc {
|
||||
term_freq += docset.term_freq();
|
||||
}
|
||||
}
|
||||
term_freq
|
||||
}
|
||||
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
let curr_doc = self.bitset.doc();
|
||||
let mut docsets = self.docsets.borrow_mut();
|
||||
for docset in docsets.iter_mut() {
|
||||
if docset.doc() < curr_doc {
|
||||
docset.seek(curr_doc);
|
||||
}
|
||||
if docset.doc() == curr_doc {
|
||||
docset.append_positions_with_offset(offset, output);
|
||||
}
|
||||
}
|
||||
debug_assert!(
|
||||
!output.is_empty(),
|
||||
"this method should only be called if positions are available"
|
||||
);
|
||||
output.sort_unstable();
|
||||
output.dedup();
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for BitSetPostingUnion<TDocSet> {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.bitset.advance()
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.bitset.seek(target)
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.bitset.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.bitset.size_hint()
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
self.bitset.count_including_deleted()
|
||||
}
|
||||
}
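// Illustrative usage sketch (not part of the original change): mirrors the
// `posting_list_union_from_docs_list` helper in the union test module. The pre-computed `BitSet`
// drives iteration; the inner docsets are only consulted lazily for frequencies and positions.
#[cfg(test)]
mod bitset_posting_union_sketch {
    use common::BitSet;

    use super::BitSetPostingUnion;
    use crate::docset::{DocSet, TERMINATED};
    use crate::query::{BitSetDocSet, VecDocSet};

    #[test]
    fn union_iterates_over_the_precomputed_bitset() {
        let docs_list = vec![vec![1u32, 5, 9], vec![2u32, 5]];
        let mut bitset = BitSet::with_max_value(10);
        for &doc in docs_list.iter().flatten() {
            bitset.insert(doc);
        }
        let docsets: Vec<VecDocSet> = docs_list.into_iter().map(VecDocSet::from).collect();
        let mut union = BitSetPostingUnion::build(docsets, BitSetDocSet::from(bitset));

        let mut collected = Vec::new();
        while union.doc() != TERMINATED {
            collected.push(union.doc());
            union.advance();
        }
        assert_eq!(collected, vec![1, 2, 5, 9]);
    }
}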
|
||||
@@ -26,7 +26,7 @@ where P: FnMut(&mut T) -> bool {
|
||||
}
|
||||
|
||||
/// Creates a `DocSet` that iterate through the union of two or more `DocSet`s.
|
||||
pub struct Union<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
pub struct BufferedUnionScorer<TScorer, TScoreCombiner = DoNothingCombiner> {
|
||||
docsets: Vec<TScorer>,
|
||||
bitsets: Box<[TinySet; HORIZON_NUM_TINYBITSETS]>,
|
||||
scores: Box<[TScoreCombiner; HORIZON as usize]>,
|
||||
@@ -61,16 +61,16 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
|
||||
});
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
|
||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer, TScoreCombiner> {
|
||||
pub(crate) fn build(
|
||||
docsets: Vec<TScorer>,
|
||||
score_combiner_fn: impl FnOnce() -> TScoreCombiner,
|
||||
) -> Union<TScorer, TScoreCombiner> {
|
||||
) -> BufferedUnionScorer<TScorer, TScoreCombiner> {
|
||||
let non_empty_docsets: Vec<TScorer> = docsets
|
||||
.into_iter()
|
||||
.filter(|docset| docset.doc() != TERMINATED)
|
||||
.collect();
|
||||
let mut union = Union {
|
||||
let mut union = BufferedUnionScorer {
|
||||
docsets: non_empty_docsets,
|
||||
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
|
||||
scores: Box::new([score_combiner_fn(); HORIZON as usize]),
|
||||
@@ -121,7 +121,7 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> DocSet for Union<TScorer, TScoreCombiner>
|
||||
impl<TScorer, TScoreCombiner> DocSet for BufferedUnionScorer<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScorer: Scorer,
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
@@ -230,7 +230,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>
|
||||
impl<TScorer, TScoreCombiner> Scorer for BufferedUnionScorer<TScorer, TScoreCombiner>
|
||||
where
|
||||
TScoreCombiner: ScoreCombiner,
|
||||
TScorer: Scorer,
|
||||
@@ -239,205 +239,3 @@ where
|
||||
self.score
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::{Union, HORIZON};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{ConstScorer, VecDocSet};
|
||||
use crate::{tests, DocId};
|
||||
|
||||
fn aux_test_union(vals: Vec<Vec<u32>>) {
|
||||
let mut val_set: BTreeSet<u32> = BTreeSet::new();
|
||||
for vs in &vals {
|
||||
for &v in vs {
|
||||
val_set.insert(v);
|
||||
}
|
||||
}
|
||||
let union_vals: Vec<u32> = val_set.into_iter().collect();
|
||||
let mut union_expected = VecDocSet::from(union_vals);
|
||||
let make_union = || {
|
||||
Union::build(
|
||||
vals.iter()
|
||||
.cloned()
|
||||
.map(VecDocSet::from)
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<ConstScorer<VecDocSet>>>(),
|
||||
DoNothingCombiner::default,
|
||||
)
|
||||
};
|
||||
let mut union: Union<_, DoNothingCombiner> = make_union();
|
||||
let mut count = 0;
|
||||
while union.doc() != TERMINATED {
|
||||
assert_eq!(union_expected.doc(), union.doc());
|
||||
assert_eq!(union_expected.advance(), union.advance());
|
||||
count += 1;
|
||||
}
|
||||
assert_eq!(union_expected.advance(), TERMINATED);
|
||||
assert_eq!(count, make_union().count_including_deleted());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union() {
|
||||
aux_test_union(vec![
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(vec![
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 1),
|
||||
tests::sample_with_seed(100_000, 0.05, 2),
|
||||
tests::sample_with_seed(100_000, 0.001, 3),
|
||||
]);
|
||||
}
|
||||
|
||||
fn test_aux_union_skip(docs_list: &[Vec<DocId>], skip_targets: Vec<DocId>) {
|
||||
let mut btree_set = BTreeSet::new();
|
||||
for docs in docs_list {
|
||||
btree_set.extend(docs.iter().cloned());
|
||||
}
|
||||
let docset_factory = || {
|
||||
let res: Box<dyn DocSet> = Box::new(Union::build(
|
||||
docs_list
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(VecDocSet::from)
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
));
|
||||
res
|
||||
};
|
||||
let mut docset = docset_factory();
|
||||
for el in btree_set {
|
||||
assert_eq!(el, docset.doc());
|
||||
docset.advance();
|
||||
}
|
||||
assert_eq!(docset.doc(), TERMINATED);
|
||||
test_skip_against_unoptimized(docset_factory, skip_targets);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case() {
|
||||
test_aux_union_skip(&[vec![165132, 167382], vec![25029, 25091]], vec![25029]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case2() {
|
||||
test_aux_union_skip(
|
||||
&[vec![1u32, 1u32 + HORIZON], vec![2u32, 1000u32, 10_000u32]],
|
||||
vec![0u32, 1u32, 2u32, 3u32, 1u32 + HORIZON, 2u32 + HORIZON],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case3() {
|
||||
let mut docset = Union::build(
|
||||
vec![
|
||||
ConstScorer::from(VecDocSet::from(vec![0u32, 5u32])),
|
||||
ConstScorer::from(VecDocSet::from(vec![1u32, 4u32])),
|
||||
],
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
assert_eq!(docset.doc(), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.doc(), 0u32)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_random() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 5, 6, 7, 8, 100],
|
||||
);
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
tests::sample_with_seed(100_000, 0.001, 1),
|
||||
tests::sample_with_seed(100_000, 0.002, 2),
|
||||
tests::sample_with_seed(100_000, 0.005, 3),
|
||||
],
|
||||
tests::sample_with_seed(100_000, 0.01, 4),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_specific() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 7, 8, 9, 99, 100, 101, 500, 20000],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use test::Bencher;
|
||||
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{ConstScorer, Union, VecDocSet};
|
||||
use crate::{tests, DocId, DocSet, TERMINATED};
|
||||
|
||||
#[bench]
|
||||
fn bench_union_3_high(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.1, 0),
|
||||
tests::sample_with_seed(100_000, 0.2, 1),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = Union::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_union_3_low(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 0),
|
||||
tests::sample_with_seed(100_000, 0.05, 1),
|
||||
tests::sample_with_seed(100_000, 0.001, 2),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = Union::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
303
src/query/union/mod.rs
Normal file
@@ -0,0 +1,303 @@
|
||||
mod bitset_union;
|
||||
mod buffered_union;
|
||||
mod simple_union;
|
||||
|
||||
pub use bitset_union::BitSetPostingUnion;
|
||||
pub use buffered_union::BufferedUnionScorer;
|
||||
pub use simple_union::SimpleUnion;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use common::BitSet;
|
||||
|
||||
use super::{SimpleUnion, *};
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::union::bitset_union::BitSetPostingUnion;
|
||||
use crate::query::{BitSetDocSet, ConstScorer, VecDocSet};
|
||||
use crate::{tests, DocId};
|
||||
|
||||
fn vec_doc_set_from_docs_list(
|
||||
docs_list: &[Vec<DocId>],
|
||||
) -> impl Iterator<Item = VecDocSet> + '_ {
|
||||
docs_list.iter().cloned().map(VecDocSet::from)
|
||||
}
|
||||
fn union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
Box::new(BufferedUnionScorer::build(
|
||||
vec_doc_set_from_docs_list(docs_list)
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<ConstScorer<VecDocSet>>>(),
|
||||
DoNothingCombiner::default,
|
||||
))
|
||||
}
|
||||
|
||||
fn posting_list_union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
Box::new(BitSetPostingUnion::build(
|
||||
vec_doc_set_from_docs_list(docs_list).collect::<Vec<VecDocSet>>(),
|
||||
bitset_from_docs_list(docs_list),
|
||||
))
|
||||
}
|
||||
fn simple_union_from_docs_list(docs_list: &[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
Box::new(SimpleUnion::build(
|
||||
vec_doc_set_from_docs_list(docs_list).collect::<Vec<VecDocSet>>(),
|
||||
))
|
||||
}
|
||||
fn bitset_from_docs_list(docs_list: &[Vec<DocId>]) -> BitSetDocSet {
|
||||
let max_doc = docs_list
|
||||
.iter()
|
||||
.flat_map(|docs| docs.iter().copied())
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
let mut doc_bitset = BitSet::with_max_value(max_doc + 1);
|
||||
for docs in docs_list {
|
||||
for &doc in docs {
|
||||
doc_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
BitSetDocSet::from(doc_bitset)
|
||||
}
|
||||
fn aux_test_union(docs_list: &[Vec<DocId>]) {
|
||||
for constructor in [
|
||||
posting_list_union_from_docs_list,
|
||||
simple_union_from_docs_list,
|
||||
union_from_docs_list,
|
||||
] {
|
||||
aux_test_union_with_constructor(constructor, docs_list);
|
||||
}
|
||||
}
|
||||
fn aux_test_union_with_constructor<F>(constructor: F, docs_list: &[Vec<DocId>])
|
||||
where F: Fn(&[Vec<DocId>]) -> Box<dyn DocSet> {
|
||||
let mut val_set: BTreeSet<u32> = BTreeSet::new();
|
||||
for vs in docs_list {
|
||||
for &v in vs {
|
||||
val_set.insert(v);
|
||||
}
|
||||
}
|
||||
let union_vals: Vec<u32> = val_set.into_iter().collect();
|
||||
let mut union_expected = VecDocSet::from(union_vals);
|
||||
let make_union = || constructor(docs_list);
|
||||
let mut union = make_union();
|
||||
let mut count = 0;
|
||||
while union.doc() != TERMINATED {
|
||||
assert_eq!(union_expected.doc(), union.doc());
|
||||
assert_eq!(union_expected.advance(), union.advance());
|
||||
count += 1;
|
||||
}
|
||||
assert_eq!(union_expected.advance(), TERMINATED);
|
||||
assert_eq!(count, make_union().count_including_deleted());
|
||||
}
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_union_is_same(vecs in prop::collection::vec(
|
||||
prop::collection::vec(0u32..100, 1..10)
|
||||
.prop_map(|mut inner| {
|
||||
inner.sort_unstable();
|
||||
inner.dedup();
|
||||
inner
|
||||
}),
|
||||
1..10
|
||||
),
|
||||
seek_docids in prop::collection::vec(0u32..100, 0..10).prop_map(|mut inner| {
|
||||
inner.sort_unstable();
|
||||
inner
|
||||
})) {
|
||||
test_docid_with_skip(&vecs, &seek_docids);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_docid_with_skip(vecs: &[Vec<DocId>], skip_targets: &[DocId]) {
|
||||
let mut union1 = posting_list_union_from_docs_list(vecs);
|
||||
let mut union2 = simple_union_from_docs_list(vecs);
|
||||
let mut union3 = union_from_docs_list(vecs);
|
||||
|
||||
// Check initial sequential advance
|
||||
while union1.doc() != TERMINATED {
|
||||
assert_eq!(union1.doc(), union2.doc());
|
||||
assert_eq!(union1.doc(), union3.doc());
|
||||
assert_eq!(union1.advance(), union2.advance());
|
||||
assert_eq!(union1.doc(), union3.advance());
|
||||
}
|
||||
|
||||
// Reset and test seek functionality
|
||||
let mut union1 = posting_list_union_from_docs_list(vecs);
|
||||
let mut union2 = simple_union_from_docs_list(vecs);
|
||||
let mut union3 = union_from_docs_list(vecs);
|
||||
|
||||
for &seek_docid in skip_targets {
|
||||
union1.seek(seek_docid);
|
||||
union2.seek(seek_docid);
|
||||
union3.seek(seek_docid);
|
||||
|
||||
// Verify that all unions have the same document after seeking
|
||||
assert_eq!(union3.doc(), union1.doc());
|
||||
assert_eq!(union3.doc(), union2.doc());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union() {
|
||||
aux_test_union(&[
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(&[
|
||||
vec![1, 3333, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![1, 2, 100000000u32],
|
||||
vec![],
|
||||
]);
|
||||
aux_test_union(&[
|
||||
tests::sample_with_seed(100_000, 0.01, 1),
|
||||
tests::sample_with_seed(100_000, 0.05, 2),
|
||||
tests::sample_with_seed(100_000, 0.001, 3),
|
||||
]);
|
||||
}
|
||||
|
||||
fn test_aux_union_skip(docs_list: &[Vec<DocId>], skip_targets: Vec<DocId>) {
|
||||
for constructor in [
|
||||
posting_list_union_from_docs_list,
|
||||
simple_union_from_docs_list,
|
||||
union_from_docs_list,
|
||||
] {
|
||||
test_aux_union_skip_with_constructor(constructor, docs_list, skip_targets.clone());
|
||||
}
|
||||
}
|
||||
fn test_aux_union_skip_with_constructor<F>(
|
||||
constructor: F,
|
||||
docs_list: &[Vec<DocId>],
|
||||
skip_targets: Vec<DocId>,
|
||||
) where
|
||||
F: Fn(&[Vec<DocId>]) -> Box<dyn DocSet>,
|
||||
{
|
||||
let mut btree_set = BTreeSet::new();
|
||||
for docs in docs_list {
|
||||
btree_set.extend(docs.iter().cloned());
|
||||
}
|
||||
let docset_factory = || {
|
||||
let res: Box<dyn DocSet> = constructor(docs_list);
|
||||
res
|
||||
};
|
||||
let mut docset = constructor(docs_list);
|
||||
for el in btree_set {
|
||||
assert_eq!(el, docset.doc());
|
||||
docset.advance();
|
||||
}
|
||||
assert_eq!(docset.doc(), TERMINATED);
|
||||
test_skip_against_unoptimized(docset_factory, skip_targets);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case() {
|
||||
test_aux_union_skip(&[vec![165132, 167382], vec![25029, 25091]], vec![25029]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case2() {
|
||||
test_aux_union_skip(
|
||||
&[vec![1u32, 1u32 + 100], vec![2u32, 1000u32, 10_000u32]],
|
||||
vec![0u32, 1u32, 2u32, 3u32, 1u32 + 100, 2u32 + 100],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_corner_case3() {
|
||||
let mut docset = posting_list_union_from_docs_list(&[vec![0u32, 5u32], vec![1u32, 4u32]]);
|
||||
assert_eq!(docset.doc(), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.seek(0u32), 0u32);
|
||||
assert_eq!(docset.doc(), 0u32)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_random() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 5, 6, 7, 8, 100],
|
||||
);
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
tests::sample_with_seed(100_000, 0.001, 1),
|
||||
tests::sample_with_seed(100_000, 0.002, 2),
|
||||
tests::sample_with_seed(100_000, 0.005, 3),
|
||||
],
|
||||
tests::sample_with_seed(100_000, 0.01, 4),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_skip_specific() {
|
||||
test_aux_union_skip(
|
||||
&[
|
||||
vec![1, 2, 3, 7],
|
||||
vec![1, 3, 9, 10000],
|
||||
vec![1, 3, 8, 9, 100],
|
||||
],
|
||||
vec![1, 2, 3, 7, 8, 9, 99, 100, 101, 500, 20000],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use test::Bencher;
|
||||
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{BufferedUnionScorer, ConstScorer, VecDocSet};
|
||||
use crate::{tests, DocId, DocSet, TERMINATED};
|
||||
|
||||
#[bench]
|
||||
fn bench_union_3_high(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.1, 0),
|
||||
tests::sample_with_seed(100_000, 0.2, 1),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = BufferedUnionScorer::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_union_3_low(bench: &mut Bencher) {
|
||||
let union_docset: Vec<Vec<DocId>> = vec![
|
||||
tests::sample_with_seed(100_000, 0.01, 0),
|
||||
tests::sample_with_seed(100_000, 0.05, 1),
|
||||
tests::sample_with_seed(100_000, 0.001, 2),
|
||||
];
|
||||
bench.iter(|| {
|
||||
let mut v = BufferedUnionScorer::build(
|
||||
union_docset
|
||||
.iter()
|
||||
.map(|doc_ids| VecDocSet::from(doc_ids.clone()))
|
||||
.map(|docset| ConstScorer::new(docset, 1.0))
|
||||
.collect::<Vec<_>>(),
|
||||
DoNothingCombiner::default,
|
||||
);
|
||||
while v.doc() != TERMINATED {
|
||||
v.advance();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
112
src/query/union/simple_union.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::Postings;
|
||||
use crate::DocId;
|
||||
|
||||
/// A `SimpleUnion` is a `DocSet` that is the union of multiple `DocSet`s.
|
||||
/// Unlike `BufferedUnionScorer`, it doesn't do any horizon precomputation.
|
||||
///
|
||||
/// For that reason SimpleUnion is a good choice for queries that skip a lot.
|
||||
pub struct SimpleUnion<TDocSet> {
|
||||
docsets: Vec<TDocSet>,
|
||||
doc: DocId,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> SimpleUnion<TDocSet> {
|
||||
pub(crate) fn build(mut docsets: Vec<TDocSet>) -> SimpleUnion<TDocSet> {
|
||||
docsets.retain(|docset| docset.doc() != TERMINATED);
|
||||
let mut docset = SimpleUnion { docsets, doc: 0 };
|
||||
|
||||
docset.initialize_first_doc_id();
|
||||
|
||||
docset
|
||||
}
|
||||
|
||||
fn initialize_first_doc_id(&mut self) {
|
||||
let mut next_doc = TERMINATED;
|
||||
|
||||
for docset in &self.docsets {
|
||||
next_doc = next_doc.min(docset.doc());
|
||||
}
|
||||
self.doc = next_doc;
|
||||
}
|
||||
|
||||
fn advance_to_next(&mut self) -> DocId {
|
||||
let mut next_doc = TERMINATED;
|
||||
|
||||
for docset in &mut self.docsets {
|
||||
if docset.doc() <= self.doc {
|
||||
docset.advance();
|
||||
}
|
||||
next_doc = next_doc.min(docset.doc());
|
||||
}
|
||||
self.doc = next_doc;
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: Postings> Postings for SimpleUnion<TDocSet> {
|
||||
fn term_freq(&self) -> u32 {
|
||||
let mut term_freq = 0;
|
||||
for docset in &self.docsets {
|
||||
let doc = docset.doc();
|
||||
if doc == self.doc {
|
||||
term_freq += docset.term_freq();
|
||||
}
|
||||
}
|
||||
term_freq
|
||||
}
|
||||
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
for docset in &mut self.docsets {
|
||||
let doc = docset.doc();
|
||||
if doc == self.doc {
|
||||
docset.append_positions_with_offset(offset, output);
|
||||
}
|
||||
}
|
||||
output.sort_unstable();
|
||||
output.dedup();
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for SimpleUnion<TDocSet> {
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.advance_to_next();
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.doc = TERMINATED;
|
||||
for docset in &mut self.docsets {
|
||||
if docset.doc() < target {
|
||||
docset.seek(target);
|
||||
}
|
||||
if docset.doc() < self.doc {
|
||||
self.doc = docset.doc();
|
||||
}
|
||||
}
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
if self.doc == TERMINATED {
|
||||
return 0u32;
|
||||
}
|
||||
let mut count = 1u32;
|
||||
while self.advance_to_next() != TERMINATED {
|
||||
count += 1;
|
||||
}
|
||||
count
|
||||
}
|
||||
}
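// Illustrative usage sketch (not part of the original change): mirrors `simple_union_from_docs_list`
// in the union test module. `SimpleUnion` advances by repeatedly taking the minimum current doc of
// its inner docsets, without any horizon buffering.
#[cfg(test)]
mod simple_union_sketch {
    use super::SimpleUnion;
    use crate::docset::{DocSet, TERMINATED};
    use crate::query::VecDocSet;

    #[test]
    fn seek_lands_on_the_next_matching_doc() {
        let docsets = vec![
            VecDocSet::from(vec![1u32, 7, 30]),
            VecDocSet::from(vec![3u32, 7]),
        ];
        let mut union = SimpleUnion::build(docsets);
        assert_eq!(union.doc(), 1);
        assert_eq!(union.seek(4), 7);
        assert_eq!(union.advance(), 30);
        assert_eq!(union.advance(), TERMINATED);
    }
}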
|
||||
@@ -50,7 +50,7 @@ impl HasLen for VecDocSet {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
pub(crate) mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
||||
|
||||
@@ -22,6 +22,7 @@ use super::se::BinaryObjectSerializer;
|
||||
use super::{OwnedValue, Value};
|
||||
use crate::schema::document::type_codes;
|
||||
use crate::schema::{Facet, Field};
|
||||
use crate::store::DocStoreVersion;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
|
||||
#[derive(Debug, thiserror::Error, Clone)]
|
||||
@@ -45,6 +46,9 @@ pub enum DeserializeError {
|
||||
#[error("{0}")]
|
||||
/// A custom error message.
|
||||
Custom(String),
|
||||
#[error("Version {0}, Max version supported: {1}")]
|
||||
/// Unsupported version error.
|
||||
UnsupportedVersion(u32, u32),
|
||||
}
|
||||
|
||||
impl DeserializeError {
|
||||
@@ -291,6 +295,7 @@ pub trait ObjectAccess<'de> {
|
||||
pub struct BinaryDocumentDeserializer<'de, R> {
|
||||
length: usize,
|
||||
position: usize,
|
||||
doc_store_version: DocStoreVersion,
|
||||
reader: &'de mut R,
|
||||
}
|
||||
|
||||
@@ -298,12 +303,16 @@ impl<'de, R> BinaryDocumentDeserializer<'de, R>
|
||||
where R: Read
|
||||
{
|
||||
/// Attempts to create a new document deserializer from a given reader.
|
||||
pub(crate) fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
|
||||
pub(crate) fn from_reader(
|
||||
reader: &'de mut R,
|
||||
doc_store_version: DocStoreVersion,
|
||||
) -> Result<Self, DeserializeError> {
|
||||
let length = VInt::deserialize(reader)?;
|
||||
|
||||
Ok(Self {
|
||||
length: length.val() as usize,
|
||||
position: 0,
|
||||
doc_store_version,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
@@ -329,8 +338,8 @@ where R: Read
|
||||
}
|
||||
|
||||
let field = Field::deserialize(self.reader).map_err(DeserializeError::from)?;
|
||||
|
||||
let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
|
||||
let deserializer =
|
||||
BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
|
||||
let value = V::deserialize(deserializer)?;
|
||||
|
||||
self.position += 1;
|
||||
@@ -344,13 +353,17 @@ where R: Read
|
||||
pub struct BinaryValueDeserializer<'de, R> {
|
||||
value_type: ValueType,
|
||||
reader: &'de mut R,
|
||||
doc_store_version: DocStoreVersion,
|
||||
}
|
||||
|
||||
impl<'de, R> BinaryValueDeserializer<'de, R>
|
||||
where R: Read
|
||||
{
|
||||
/// Attempts to create a new value deserializer from a given reader.
|
||||
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
|
||||
fn from_reader(
|
||||
reader: &'de mut R,
|
||||
doc_store_version: DocStoreVersion,
|
||||
) -> Result<Self, DeserializeError> {
|
||||
let type_code = <u8 as BinarySerializable>::deserialize(reader)?;
|
||||
|
||||
let value_type = match type_code {
|
||||
@@ -381,7 +394,7 @@ where R: Read
|
||||
type_codes::NULL_CODE => ValueType::Null,
|
||||
type_codes::ARRAY_CODE => ValueType::Array,
|
||||
type_codes::OBJECT_CODE => ValueType::Object,
|
||||
#[allow(deprecated)]
|
||||
#[expect(deprecated)]
|
||||
type_codes::JSON_OBJ_CODE => ValueType::JSONObject,
|
||||
_ => {
|
||||
return Err(DeserializeError::from(io::Error::new(
|
||||
@@ -391,7 +404,11 @@ where R: Read
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self { value_type, reader })
|
||||
Ok(Self {
|
||||
value_type,
|
||||
reader,
|
||||
doc_store_version,
|
||||
})
|
||||
}
|
||||
|
||||
fn validate_type(&self, expected_type: ValueType) -> Result<(), DeserializeError> {
|
||||
@@ -438,7 +455,16 @@ where R: Read
|
||||
|
||||
fn deserialize_datetime(self) -> Result<DateTime, DeserializeError> {
|
||||
self.validate_type(ValueType::DateTime)?;
|
||||
<DateTime as BinarySerializable>::deserialize(self.reader).map_err(DeserializeError::from)
|
||||
match self.doc_store_version {
|
||||
DocStoreVersion::V1 => {
|
||||
let timestamp_micros = <i64 as BinarySerializable>::deserialize(self.reader)?;
|
||||
Ok(DateTime::from_timestamp_micros(timestamp_micros))
|
||||
}
|
||||
DocStoreVersion::V2 => {
|
||||
let timestamp_nanos = <i64 as BinarySerializable>::deserialize(self.reader)?;
|
||||
Ok(DateTime::from_timestamp_nanos(timestamp_nanos))
|
||||
}
|
||||
}
|
||||
}
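// Illustrative note (not part of the original change): V1 stored the timestamp in microseconds,
// V2 stores it in nanoseconds, so for the same instant the two branches above produce the same
// `DateTime`, e.g.:
//
//     let t_micros: i64 = 1_700_000_000_123_456;
//     assert_eq!(
//         DateTime::from_timestamp_micros(t_micros),
//         DateTime::from_timestamp_nanos(t_micros * 1_000),
//     );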
|
||||
|
||||
fn deserialize_facet(self) -> Result<Facet, DeserializeError> {
|
||||
@@ -514,11 +540,13 @@ where R: Read
visitor.visit_pre_tokenized_string(val)
}
ValueType::Array => {
let access = BinaryArrayDeserializer::from_reader(self.reader)?;
let access =
BinaryArrayDeserializer::from_reader(self.reader, self.doc_store_version)?;
visitor.visit_array(access)
}
ValueType::Object => {
let access = BinaryObjectDeserializer::from_reader(self.reader)?;
let access =
BinaryObjectDeserializer::from_reader(self.reader, self.doc_store_version)?;
visitor.visit_object(access)
}
#[allow(deprecated)]
@@ -537,7 +565,8 @@ where R: Read

let out_rc = std::rc::Rc::new(out);
let mut slice: &[u8] = &out_rc;
let access = BinaryObjectDeserializer::from_reader(&mut slice)?;
let access =
BinaryObjectDeserializer::from_reader(&mut slice, self.doc_store_version)?;

visitor.visit_object(access)
}
@@ -551,19 +580,24 @@ pub struct BinaryArrayDeserializer<'de, R> {
length: usize,
position: usize,
reader: &'de mut R,
doc_store_version: DocStoreVersion,
}

impl<'de, R> BinaryArrayDeserializer<'de, R>
where R: Read
{
/// Attempts to create a new array deserializer from a given reader.
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
fn from_reader(
reader: &'de mut R,
doc_store_version: DocStoreVersion,
) -> Result<Self, DeserializeError> {
let length = <VInt as BinarySerializable>::deserialize(reader)?;

Ok(Self {
length: length.val() as usize,
position: 0,
reader,
doc_store_version,
})
}

@@ -587,7 +621,8 @@ where R: Read
return Ok(None);
}

let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
let deserializer =
BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
let value = V::deserialize(deserializer)?;

// Advance the position cursor.
@@ -610,8 +645,11 @@ impl<'de, R> BinaryObjectDeserializer<'de, R>
where R: Read
{
/// Attempts to create a new object deserializer from a given reader.
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
let inner = BinaryArrayDeserializer::from_reader(reader)?;
fn from_reader(
reader: &'de mut R,
doc_store_version: DocStoreVersion,
) -> Result<Self, DeserializeError> {
let inner = BinaryArrayDeserializer::from_reader(reader, doc_store_version)?;
Ok(Self { inner })
}
}
@@ -819,6 +857,7 @@ mod tests {
use crate::schema::document::existing_type_impls::JsonObjectIter;
use crate::schema::document::se::BinaryValueSerializer;
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf};
use crate::store::DOC_STORE_VERSION;

fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
let mut writer = Vec::new();
@@ -829,9 +868,19 @@ mod tests {
writer
}

fn serialize_owned_value<'a>(value: ReferenceValue<'a, &'a OwnedValue>) -> Vec<u8> {
let mut writer = Vec::new();

let mut serializer = BinaryValueSerializer::new(&mut writer);
serializer.serialize_value(value).expect("Serialize value");

writer
}

fn deserialize_value(buffer: Vec<u8>) -> crate::schema::OwnedValue {
let mut cursor = Cursor::new(buffer);
let deserializer = BinaryValueDeserializer::from_reader(&mut cursor).unwrap();
let deserializer =
BinaryValueDeserializer::from_reader(&mut cursor, DOC_STORE_VERSION).unwrap();
crate::schema::OwnedValue::deserialize(deserializer).expect("Deserialize value")
}

@@ -1010,6 +1059,17 @@ mod tests {
assert_eq!(value, expected_val);
}

#[test]
fn test_nested_date_precision() {
let object = OwnedValue::Object(vec![(
"my-date".into(),
OwnedValue::Date(DateTime::from_timestamp_nanos(323456)),
)]);
let result = serialize_owned_value((&object).as_value());
let value = deserialize_value(result);
assert_eq!(value, object);
}

#[test]
fn test_nested_serialize() {
let mut object = serde_json::Map::new();
@@ -401,7 +401,7 @@ impl PartialEq for CompactDocValue<'_> {
value1 == value2
}
}
impl<'a> From<CompactDocValue<'a>> for OwnedValue {
impl From<CompactDocValue<'_>> for OwnedValue {
fn from(value: CompactDocValue) -> Self {
value.as_value().into()
}
@@ -81,6 +81,15 @@ where W: Write
Self { writer }
}

fn serialize_with_type_code<T: BinarySerializable>(
&mut self,
code: u8,
val: &T,
) -> io::Result<()> {
self.write_type_code(code)?;
BinarySerializable::serialize(val, self.writer)
}

/// Attempts to serialize a given value and write the output
/// to the writer.
pub(crate) fn serialize_value<'a, V>(
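The new `serialize_with_type_code` helper factors out the "one type-code byte, then the payload" framing that the leaf arms below all repeat. A standalone sketch of the same pattern, using a raw byte slice in place of `BinarySerializable` (the constant is illustrative; tantivy's real codes live in `type_codes`):

```rust
use std::io::{self, Write};

// Illustrative tag value, not one of tantivy's actual type codes.
const U64_CODE: u8 = 2;

// Write the tag byte first, then the value's binary form.
fn serialize_with_type_code<W: Write>(writer: &mut W, code: u8, payload: &[u8]) -> io::Result<()> {
    writer.write_all(&[code])?;
    writer.write_all(payload)
}

fn main() -> io::Result<()> {
    let mut out = Vec::new();
    serialize_with_type_code(&mut out, U64_CODE, &42u64.to_le_bytes())?;
    assert_eq!(out[0], U64_CODE);
    assert_eq!(out.len(), 1 + 8); // tag + payload
    Ok(())
}
```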
@@ -94,56 +103,38 @@ where W: Write
ReferenceValue::Leaf(leaf) => match leaf {
ReferenceValueLeaf::Null => self.write_type_code(type_codes::NULL_CODE),
ReferenceValueLeaf::Str(val) => {
self.write_type_code(type_codes::TEXT_CODE)?;

let temp_val = Cow::Borrowed(val);
temp_val.serialize(self.writer)
self.serialize_with_type_code(type_codes::TEXT_CODE, &Cow::Borrowed(val))
}
ReferenceValueLeaf::U64(val) => {
self.write_type_code(type_codes::U64_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::U64_CODE, &val)
}
ReferenceValueLeaf::I64(val) => {
self.write_type_code(type_codes::I64_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::I64_CODE, &val)
}
ReferenceValueLeaf::F64(val) => {
self.write_type_code(type_codes::F64_CODE)?;

f64_to_u64(val).serialize(self.writer)
self.serialize_with_type_code(type_codes::F64_CODE, &f64_to_u64(val))
}
ReferenceValueLeaf::Date(val) => {
self.write_type_code(type_codes::DATE_CODE)?;
val.serialize(self.writer)
}
ReferenceValueLeaf::Facet(val) => {
self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?;

Cow::Borrowed(val).serialize(self.writer)
let timestamp_nanos: i64 = val.into_timestamp_nanos();
BinarySerializable::serialize(&timestamp_nanos, self.writer)
}
ReferenceValueLeaf::Facet(val) => self.serialize_with_type_code(
type_codes::HIERARCHICAL_FACET_CODE,
&Cow::Borrowed(val),
),
ReferenceValueLeaf::Bytes(val) => {
self.write_type_code(type_codes::BYTES_CODE)?;

let temp_val = Cow::Borrowed(val);
temp_val.serialize(self.writer)
self.serialize_with_type_code(type_codes::BYTES_CODE, &Cow::Borrowed(val))
}
ReferenceValueLeaf::IpAddr(val) => {
self.write_type_code(type_codes::IP_CODE)?;

val.to_u128().serialize(self.writer)
self.serialize_with_type_code(type_codes::IP_CODE, &val.to_u128())
}
ReferenceValueLeaf::Bool(val) => {
self.write_type_code(type_codes::BOOL_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::BOOL_CODE, &val)
}
ReferenceValueLeaf::PreTokStr(val) => {
self.write_type_code(type_codes::EXT_CODE)?;
self.write_type_code(type_codes::TOK_STR_EXT_CODE)?;

val.serialize(self.writer)
self.serialize_with_type_code(type_codes::TOK_STR_EXT_CODE, &*val)
}
},
ReferenceValue::Array(elements) => {
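On the write side, the `Date` arm now serializes an explicit nanosecond `i64` (via `into_timestamp_nanos`) instead of deferring to `DateTime`'s own `BinarySerializable` impl; the old representation was microseconds, which is what the V1 read path above still decodes. A tiny illustration of what a microsecond round trip loses, in plain integers rather than tantivy's types:

```rust
// Storing a nanosecond timestamp at microsecond granularity (the V1 on-disk
// representation) drops the sub-microsecond part.
fn roundtrip_through_micros(nanos: i64) -> i64 {
    (nanos / 1_000) * 1_000
}

fn main() {
    let nanos = 323_456; // same magnitude as the value in test_nested_date_precision
    assert_eq!(roundtrip_through_micros(nanos), 323_000);
    assert_ne!(roundtrip_through_micros(nanos), nanos); // precision lost under V1
}
```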
@@ -306,7 +297,6 @@ where W: Write
mod tests {
use std::collections::BTreeMap;

use common::DateTime;
use serde_json::Number;
use tokenizer_api::Token;

@@ -337,7 +327,10 @@ mod tests {
$ext_code.serialize(&mut writer).unwrap();
)?

$value.serialize(&mut writer).unwrap();
BinarySerializable::serialize(
&$value,
&mut writer,
).unwrap();
)*

writer
@@ -355,7 +348,10 @@ mod tests {
$ext_code.serialize(&mut writer).unwrap();
)?

$value.serialize(&mut writer).unwrap();
BinarySerializable::serialize(
&$value,
&mut writer,
).unwrap();
)*

writer
@@ -418,15 +414,6 @@ mod tests {
"Expected serialized value to match the binary representation"
);

let result = serialize_value(ReferenceValueLeaf::Date(DateTime::MAX).into());
let expected = binary_repr!(
type_codes::DATE_CODE => DateTime::MAX,
);
assert_eq!(
result, expected,
"Expected serialized value to match the binary representation"
);

let facet = Facet::from_text("/hello/world").unwrap();
let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into());
let expected = binary_repr!(
@@ -4,7 +4,7 @@ use std::io::{self, Read, Write};
use std::str;
use std::string::FromUtf8Error;

use common::BinarySerializable;
use common::*;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::de::Error as _;

@@ -331,6 +331,7 @@ where B: AsRef<[u8]>
}

/// ValueBytes represents a serialized value.
///
/// The value can be of any type of [`Type`] (e.g. string, u64, f64, bool, date, JSON).
/// The serialized representation matches the lexicographical order of the type.
///

@@ -1,5 +1,6 @@
//! [`SnippetGenerator`]
//! Generates a text snippet for a given document, and some highlighted parts inside it.
//!
//! Imagine you doing a text search in a document
//! and want to show a preview of where in the document the search terms occur,
//! along with some surrounding text to give context, and the search terms highlighted.
@@ -436,7 +437,7 @@ impl SnippetGenerator {
}

#[cfg(test)]
pub fn terms_text(&self) -> &BTreeMap<String, Score> {
pub(crate) fn terms_text(&self) -> &BTreeMap<String, Score> {
&self.terms_text
}
@@ -78,7 +78,7 @@ pub struct SegmentSpaceUsage {
}

impl SegmentSpaceUsage {
#[allow(clippy::too_many_arguments)]
#[expect(clippy::too_many_arguments)]
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,

@@ -4,7 +4,7 @@ use std::mem;
use lz4_flex::{compress_into, decompress_into};

#[inline]
#[allow(clippy::uninit_vec)]
#[expect(clippy::uninit_vec)]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let maximum_output_size =
@@ -24,7 +24,7 @@ pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>
}

#[inline]
#[allow(clippy::uninit_vec)]
#[expect(clippy::uninit_vec)]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
let uncompressed_size_bytes: &[u8; 4] = compressed
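Several hunks in this change swap `#[allow(...)]` for `#[expect(...)]`, stabilized in Rust 1.81. The difference is that an expectation raises an `unfulfilled_lint_expectations` warning if the suppressed lint ever stops firing, so stale suppressions surface on their own. A minimal, hypothetical example:

```rust
// The lint fires here (the binding really is unused), so the expectation is
// fulfilled and nothing is reported. If the body changed to use `answer`,
// the compiler would warn that the expectation is no longer needed.
#[expect(unused_variables)]
fn demo() {
    let answer = 42;
}

fn main() {
    demo();
}
```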
@@ -2,12 +2,13 @@ use std::io;

use common::{BinarySerializable, FixedSize, HasLen};

use super::{Decompressor, DOC_STORE_VERSION};
use super::{Decompressor, DocStoreVersion, DOC_STORE_VERSION};
use crate::directory::FileSlice;

#[derive(Debug, Clone, PartialEq)]
pub struct DocStoreFooter {
pub offset: u64,
pub doc_store_version: DocStoreVersion,
pub decompressor: Decompressor,
}

@@ -25,9 +26,11 @@ impl BinarySerializable for DocStoreFooter {
}

fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_store_version = u32::deserialize(reader)?;
if doc_store_version != DOC_STORE_VERSION {
panic!("actual doc store version: {doc_store_version}, expected: {DOC_STORE_VERSION}");
let doc_store_version = DocStoreVersion::deserialize(reader)?;
if doc_store_version > DOC_STORE_VERSION {
panic!(
"actual doc store version: {doc_store_version}, max_supported: {DOC_STORE_VERSION}"
);
}
let offset = u64::deserialize(reader)?;
let compressor_id = u8::deserialize(reader)?;
@@ -35,6 +38,7 @@ impl BinarySerializable for DocStoreFooter {
reader.read_exact(&mut skip_buf)?;
Ok(DocStoreFooter {
offset,
doc_store_version,
decompressor: Decompressor::from_id(compressor_id),
})
}
@@ -45,9 +49,14 @@ impl FixedSize for DocStoreFooter {
}

impl DocStoreFooter {
pub fn new(offset: u64, decompressor: Decompressor) -> Self {
pub fn new(
offset: u64,
decompressor: Decompressor,
doc_store_version: DocStoreVersion,
) -> Self {
DocStoreFooter {
offset,
doc_store_version,
decompressor,
}
}
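The footer now carries a typed `DocStoreVersion` and only rejects stores that are *newer* than what the reader supports, instead of demanding an exact match; that is what lets a V2-writing build keep reading V1 segments. A sketch of that policy under illustrative names (and returning an error rather than the `panic!` used above):

```rust
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
enum DocStoreVersion {
    V1 = 1,
    V2 = 2,
}

const MAX_SUPPORTED: DocStoreVersion = DocStoreVersion::V2;

// Older formats stay readable; anything newer than this build understands is refused.
fn check_footer_version(found: DocStoreVersion) -> Result<(), String> {
    if found > MAX_SUPPORTED {
        return Err(format!(
            "doc store version {found:?} is newer than the supported {MAX_SUPPORTED:?}"
        ));
    }
    Ok(())
}

fn main() {
    assert!(check_footer_version(DocStoreVersion::V1).is_ok());
    assert!(check_footer_version(DocStoreVersion::V2).is_ok());
}
```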
@@ -11,7 +11,7 @@ pub struct LayerCursor<'a> {
cursor: usize,
}

impl<'a> Iterator for LayerCursor<'a> {
impl Iterator for LayerCursor<'_> {
type Item = Checkpoint;

fn next(&mut self) -> Option<Checkpoint> {

@@ -35,15 +35,16 @@ mod footer;
mod index;
mod reader;
mod writer;

pub use self::compressors::{Compressor, ZstdCompressor};
pub use self::decompressors::Decompressor;
pub(crate) use self::reader::DOCSTORE_CACHE_CAPACITY;
pub use self::reader::{CacheStats, StoreReader};
pub(crate) use self::reader::{DocStoreVersion, DOCSTORE_CACHE_CAPACITY};
pub use self::writer::StoreWriter;
mod store_compressor;

/// Doc store version in footer to handle format changes.
pub(crate) const DOC_STORE_VERSION: u32 = 1;
pub(crate) const DOC_STORE_VERSION: DocStoreVersion = DocStoreVersion::V2;

#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
@@ -52,7 +53,7 @@ mod compression_lz4_block;
mod compression_zstd_block;

#[cfg(test)]
pub mod tests {
pub(crate) mod tests {

use std::path::Path;

@@ -1,3 +1,4 @@
use std::fmt::Display;
use std::io;
use std::iter::Sum;
use std::num::NonZeroUsize;
@@ -25,9 +26,43 @@ pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100;

type Block = OwnedBytes;

/// The format version of the document store.
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
pub(crate) enum DocStoreVersion {
V1 = 1,
V2 = 2,
}
impl Display for DocStoreVersion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DocStoreVersion::V1 => write!(f, "V1"),
DocStoreVersion::V2 => write!(f, "V2"),
}
}
}
impl BinarySerializable for DocStoreVersion {
fn serialize<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
(*self as u32).serialize(writer)
}

fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
Ok(match u32::deserialize(reader)? {
1 => DocStoreVersion::V1,
2 => DocStoreVersion::V2,
v => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid doc store version {}", v),
))
}
})
}
}

/// Reads document off tantivy's [`Store`](./index.html)
pub struct StoreReader {
decompressor: Decompressor,
doc_store_version: DocStoreVersion,
data: FileSlice,
skip_index: Arc<SkipIndex>,
space_usage: StoreSpaceUsage,
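On disk the version is a plain `u32`, and unknown discriminants are rejected as `InvalidData` rather than silently mapped to something. A self-contained sketch of that round trip; the little-endian encoding is an assumption standing in for `BinarySerializable`:

```rust
use std::io::{self, Read, Write};

#[derive(Clone, Copy, Debug, PartialEq)]
enum DocStoreVersion {
    V1 = 1,
    V2 = 2,
}

fn write_version<W: Write>(writer: &mut W, version: DocStoreVersion) -> io::Result<()> {
    writer.write_all(&(version as u32).to_le_bytes())
}

fn read_version<R: Read>(reader: &mut R) -> io::Result<DocStoreVersion> {
    let mut buf = [0u8; 4];
    reader.read_exact(&mut buf)?;
    match u32::from_le_bytes(buf) {
        1 => Ok(DocStoreVersion::V1),
        2 => Ok(DocStoreVersion::V2),
        // Unknown versions become a hard error instead of being misread.
        v => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!("invalid doc store version {v}"),
        )),
    }
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    write_version(&mut buf, DocStoreVersion::V2)?;
    assert_eq!(read_version(&mut &buf[..])?, DocStoreVersion::V2);
    Ok(())
}
```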
@@ -129,6 +164,7 @@ impl StoreReader {
let skip_index = SkipIndex::open(index_data);
Ok(StoreReader {
decompressor: footer.decompressor,
doc_store_version: footer.doc_store_version,
data: data_file,
cache: BlockCache {
cache: NonZeroUsize::new(cache_num_blocks)
@@ -203,8 +239,9 @@ impl StoreReader {
pub fn get<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes(doc_id)?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
let deserializer =
BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}

@@ -244,8 +281,9 @@ impl StoreReader {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
let deserializer =
BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
})
}
@@ -391,8 +429,9 @@ impl StoreReader {
) -> crate::Result<D> {
let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
let deserializer =
BinaryDocumentDeserializer::from_reader(&mut doc_bytes, self.doc_store_version)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
}
}
@@ -414,6 +453,11 @@ mod tests {
doc.get_first(*field).and_then(|f| f.as_value().as_str())
}

#[test]
fn test_doc_store_version_ord() {
assert!(DocStoreVersion::V1 < DocStoreVersion::V2);
}

#[test]
fn test_store_lru_cache() -> crate::Result<()> {
let directory = RamDirectory::create();

@@ -5,6 +5,7 @@ use std::{io, thread};

use common::{BinarySerializable, CountingWriter, TerminatingWrite};

use super::DOC_STORE_VERSION;
use crate::directory::WritePtr;
use crate::store::footer::DocStoreFooter;
use crate::store::index::{Checkpoint, SkipIndexBuilder};
@@ -143,8 +144,11 @@ impl BlockCompressorImpl {

fn close(mut self) -> io::Result<()> {
let header_offset: u64 = self.writer.written_bytes();
let docstore_footer =
DocStoreFooter::new(header_offset, Decompressor::from(self.compressor));
let docstore_footer = DocStoreFooter::new(
header_offset,
Decompressor::from(self.compressor),
DOC_STORE_VERSION,
);
self.offset_index_writer.serialize_into(&mut self.writer)?;
docstore_footer.serialize(&mut self.writer)?;
self.writer.terminate()
@@ -82,7 +82,7 @@ where A: Automaton
current_value: TermInfo,
}

impl<'a, A> TermStreamer<'a, A>
impl<A> TermStreamer<'_, A>
where A: Automaton
{
/// Advance position the stream on the next item.
@@ -136,7 +136,7 @@ where A: Automaton
}

/// Return the next `(key, value)` pair.
#[allow(clippy::should_implement_trait)]
#[expect(clippy::should_implement_trait)]
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
if self.advance() {
Some((self.key(), self.value()))

@@ -49,7 +49,6 @@ use crate::postings::TermInfo;

#[derive(Debug, Eq, PartialEq)]
#[repr(u32)]
#[allow(dead_code)]
enum DictionaryType {
Fst = 1,
SSTable = 2,
@@ -42,7 +42,7 @@ pub struct AsciiFoldingFilterTokenStream<'a, T> {
tail: T,
}

impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'_, T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;

@@ -40,7 +40,7 @@ impl Tokenizer for FacetTokenizer {
}
}

impl<'a> TokenStream for FacetTokenStream<'a> {
impl TokenStream for FacetTokenStream<'_> {
fn advance(&mut self) -> bool {
match self.state {
State::RootFacetNotEmitted => {

@@ -51,7 +51,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
}
}

impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<'_, T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;

@@ -166,7 +166,7 @@ pub use self::whitespace_tokenizer::WhitespaceTokenizer;
pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;

#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use super::{
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
};

@@ -159,7 +159,7 @@ impl Tokenizer for NgramTokenizer {
}
}

impl<'a> TokenStream for NgramTokenStream<'a> {
impl TokenStream for NgramTokenStream<'_> {
fn advance(&mut self) -> bool {
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
if self.prefix_only && offset_from > 0 {
@@ -283,7 +283,7 @@ impl<'a> CodepointFrontiers<'a> {
}
}

impl<'a> Iterator for CodepointFrontiers<'a> {
impl Iterator for CodepointFrontiers<'_> {
type Item = usize;

fn next(&mut self) -> Option<usize> {

@@ -28,7 +28,7 @@ impl Tokenizer for RawTokenizer {
}
}

impl<'a> TokenStream for RawTokenStream<'a> {
impl TokenStream for RawTokenStream<'_> {
fn advance(&mut self) -> bool {
let result = self.has_token;
self.has_token = false;

@@ -4,6 +4,7 @@ use super::{Token, TokenStream, Tokenizer};
use crate::TantivyError;

/// Tokenize the text by using a regex pattern to split.
///
/// Each match of the regex emits a distinct token, empty tokens will not be emitted. Anchors such
/// as `\A` will match the text from the part where the last token was emitted or the beginning of
/// the complete text if no token was emitted yet.
@@ -83,7 +84,7 @@ pub struct RegexTokenStream<'a> {
cursor: usize,
}

impl<'a> TokenStream for RegexTokenStream<'a> {
impl TokenStream for RegexTokenStream<'_> {
fn advance(&mut self) -> bool {
let Some(regex_match) = self.regex.find(self.text) else {
return false;

@@ -27,7 +27,7 @@ impl Tokenizer for SimpleTokenizer {
}
}

impl<'a> SimpleTokenStream<'a> {
impl SimpleTokenStream<'_> {
// search for the end of the current token.
fn search_token_end(&mut self) -> usize {
(&mut self.chars)
@@ -38,7 +38,7 @@ impl<'a> SimpleTokenStream<'a> {
}
}

impl<'a> TokenStream for SimpleTokenStream<'a> {
impl TokenStream for SimpleTokenStream<'_> {
fn advance(&mut self) -> bool {
self.token.text.clear();
self.token.position = self.token.position.wrapping_add(1);
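The long tail of tokenizer hunks above is the same mechanical cleanup everywhere: when an impl never uses the lifetime by name, `impl<'a> Trait for Foo<'a>` becomes `impl Trait for Foo<'_>`, with no behavioral change. A small, hypothetical equivalent pair:

```rust
struct CharCursor<'a> {
    text: &'a str,
    offset: usize,
}

// Same meaning as `impl<'a> Iterator for CharCursor<'a>`; the lifetime is
// simply left anonymous because the body never needs to name it.
impl Iterator for CharCursor<'_> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        let c = self.text[self.offset..].chars().next()?;
        self.offset += c.len_utf8();
        Some(c)
    }
}

fn main() {
    let cursor = CharCursor { text: "abc", offset: 0 };
    assert_eq!(cursor.collect::<String>(), "abc");
}
```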