Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-07 17:42:55 +00:00

Compare commits: hfb/phrase ... unused-dep

5 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 4823771228 | |
| | 1db76dd9cf | |
| | 467a9517db | |
| | b361315a67 | |
| | 4e3771bffc | |
@@ -12,8 +12,6 @@ description = "Fast field codecs used by tantivy"
 common = { path = "../common/" }
 tantivy-bitpacker = { path = "../bitpacker/" }
-prettytable-rs = {version="0.8.0", optional= true}
-#prettytable-rs = {version="0.8.0" }
 rand = "0.8.3"
 
 [dev-dependencies]
 more-asserts = "0.2.1"
@@ -16,7 +16,7 @@ use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
 use crate::schema::Field;
 use crate::{Score, SegmentReader, TantivyError};
 
-/// The `FilterCollector` collector filters docs using a u64 fast field value and a predicate.
+/// The `FilterCollector` collector filters docs using a fast field value and a predicate.
 /// Only the documents for which the predicate returned "true" will be passed on to the next collector.
 ///
 /// ```rust
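Note: the `FilterCollector` doc example is truncated in this view. A minimal usage sketch for the tantivy versions this diff targets follows; the exact constructor shape (`FilterCollector::new(field, &predicate, inner_collector)`) is an assumption here, since it has changed across releases.

```rust
use tantivy::collector::{FilterCollector, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let price = schema_builder.add_u64_field("price", FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(title => "cheap book", price => 5u64));
    writer.add_document(doc!(title => "pricey book", price => 50u64));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("book")?;
    // Only documents whose `price` fast field satisfies the predicate are
    // passed on to the wrapped TopDocs collector.
    let collector = FilterCollector::new(price, &|price: u64| price > 10, TopDocs::with_limit(2));
    let top_docs = searcher.search(&query, &collector)?;
    assert_eq!(top_docs.len(), 1);
    Ok(())
}
```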
@@ -31,9 +31,6 @@ use std::{collections::HashMap, io};
 ///
 /// The segment reader has a very low memory footprint,
 /// as close to all of the memory data is mmapped.
-///
-///
-/// TODO fix not decoding docfreq
 #[derive(Clone)]
 pub struct SegmentReader {
     inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
@@ -56,15 +53,12 @@ pub struct SegmentReader {
 impl SegmentReader {
     /// Returns the highest document id ever attributed in
     /// this segment + 1.
-    /// Today, `tantivy` does not handle deletes, so it happens
-    /// to also be the number of documents in the index.
     pub fn max_doc(&self) -> DocId {
         self.max_doc
     }
 
     /// Returns the number of alive documents.
     /// Deleted documents are not counted.
-    ///
     pub fn num_docs(&self) -> DocId {
         self.num_docs
     }
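The two accessors above are tied together: `max_doc()` bounds the segment's doc id space, while `num_docs()` counts only alive documents. A hedged sketch of that invariant, assuming the `SegmentReader::is_deleted` accessor of this era's API:

```rust
use tantivy::SegmentReader;

// Scanning the doc id space and skipping deleted documents should agree
// with `num_docs()`; the difference `max_doc() - num_docs()` is the
// number of deleted documents in the segment.
fn count_alive(segment_reader: &SegmentReader) -> u32 {
    (0..segment_reader.max_doc())
        .filter(|&doc| !segment_reader.is_deleted(doc))
        .count() as u32
}
```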
@@ -179,87 +179,6 @@ pub mod tests {
         assert_nearly_equals!(scores[1], 0.46844664);
     }
 
-    #[ignore]
-    #[test]
-    pub fn test_phrase_score_with_slop() {
-        let index = create_index(&["a c b", "a b c a b"]);
-        let schema = index.schema();
-        let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader().unwrap().searcher();
-        let test_query = |texts: Vec<&str>| {
-            let terms: Vec<Term> = texts
-                .iter()
-                .map(|text| Term::from_field_text(text_field, text))
-                .collect();
-            let mut phrase_query = PhraseQuery::new(terms);
-            phrase_query.slop(1);
-            searcher
-                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-                .expect("search should succeed")
-                .scores()
-                .to_vec()
-        };
-        let scores = test_query(vec!["a", "b"]);
-        assert_nearly_equals!(scores[0], 0.40618482);
-        assert_nearly_equals!(scores[1], 0.46844664);
-    }
-
-    #[test]
-    pub fn test_phrase_score_with_slop_size() {
-        let index = create_index(&["a b e c", "a e e e c", "a e e e e c"]);
-        let schema = index.schema();
-        let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader().unwrap().searcher();
-        let test_query = |texts: Vec<&str>| {
-            let terms: Vec<Term> = texts
-                .iter()
-                .map(|text| Term::from_field_text(text_field, text))
-                .collect();
-            let mut phrase_query = PhraseQuery::new(terms);
-            phrase_query.slop(3);
-            searcher
-                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-                .expect("search should succeed")
-                .scores()
-                .to_vec()
-        };
-        let scores = test_query(vec!["a", "c"]);
-        assert_nearly_equals!(scores[0], 0.29086056);
-        assert_nearly_equals!(scores[1], 0.26706287);
-    }
-
-    #[test]
-    pub fn test_phrase_score_with_slop_ordering() {
-        let index = create_index(&[
-            "a e b e c",
-            "a e e e e e b e e e e c",
-            "a c b",
-            "a c e b e",
-            "a e c b",
-            "a e b c",
-        ]);
-        let schema = index.schema();
-        let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader().unwrap().searcher();
-        let test_query = |texts: Vec<&str>| {
-            let terms: Vec<Term> = texts
-                .iter()
-                .map(|text| Term::from_field_text(text_field, text))
-                .collect();
-            let mut phrase_query = PhraseQuery::new(terms);
-            phrase_query.slop(3);
-            searcher
-                .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-                .expect("search should succeed")
-                .scores()
-                .to_vec()
-        };
-        let scores = test_query(vec!["a", "b", "c"]);
-        // The first and last matches.
-        assert_nearly_equals!(scores[0], 0.23091172);
-        assert_nearly_equals!(scores[1], 0.25024384);
-    }
-
     #[test] // motivated by #234
     pub fn test_phrase_query_docfreq_order() {
         let mut schema_builder = Schema::builder();
@@ -26,7 +26,6 @@ use crate::schema::{Field, Term};
 pub struct PhraseQuery {
     field: Field,
     phrase_terms: Vec<(usize, Term)>,
-    slop: u32,
 }
 
 impl PhraseQuery {
@@ -57,15 +56,9 @@ impl PhraseQuery {
         PhraseQuery {
             field,
             phrase_terms: terms,
-            slop: 0,
         }
     }
 
-    /// Slop allowed for the phrase.
-    pub fn slop(&mut self, value: u32) {
-        self.slop = value;
-    }
-
     /// The `Field` this `PhraseQuery` is targeting.
     pub fn field(&self) -> Field {
         self.field
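The removed `slop` setter is what the deleted tests earlier in this compare exercised. A hedged usage sketch against the hfb/phrase side (the setter does not exist on the other side): with a slop of 1, the phrase "a b" tolerates one intervening token, so it matches "a c b".

```rust
use tantivy::collector::Count;
use tantivy::query::PhraseQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    writer.add_document(doc!(text => "a c b"));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let terms = vec![
        Term::from_field_text(text, "a"),
        Term::from_field_text(text, "b"),
    ];
    let mut phrase_query = PhraseQuery::new(terms);
    // Setter from the hfb/phrase branch: allow up to one position of slack.
    phrase_query.slop(1);
    let count = searcher.search(&phrase_query, &Count)?;
    assert_eq!(count, 1);
    Ok(())
}
```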
@@ -104,11 +97,11 @@ impl PhraseQuery {
         }
         let terms = self.phrase_terms();
         let bm25_weight = Bm25Weight::for_terms(searcher, &terms)?;
-        let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
-        if self.slop > 0 {
-            weight.slop(self.slop);
-        }
-        Ok(weight)
+        Ok(PhraseWeight::new(
+            self.phrase_terms.clone(),
+            bm25_weight,
+            scoring_enabled,
+        ))
     }
 }
 
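For context, `prepare_weight` sits behind the query's `Query::weight` entry point. A hedged sketch of the call path, assuming the `weight(searcher, scoring_enabled)` and `scorer(reader, boost)` signatures of this tantivy era (later releases changed both):

```rust
use tantivy::query::Query;
use tantivy::{Searcher, TERMINATED};

// Counts matches by driving the weight's per-segment scorers by hand,
// the same machinery `prepare_weight` feeds in the diff above.
fn count_matches(searcher: &Searcher, query: &dyn Query) -> tantivy::Result<usize> {
    let weight = query.weight(searcher, /* scoring_enabled */ false)?;
    let mut total = 0;
    for segment_reader in searcher.segment_readers() {
        let mut scorer = weight.scorer(segment_reader, 1.0)?;
        // Scorers start positioned on their first document.
        while scorer.doc() != TERMINATED {
            total += 1;
            scorer.advance();
        }
    }
    Ok(total)
}
```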
@@ -51,7 +51,6 @@ pub struct PhraseScorer<TPostings: Postings> {
     fieldnorm_reader: FieldNormReader,
     similarity_weight: Bm25Weight,
     scoring_enabled: bool,
-    slop: u32,
 }
 
 /// Returns true iff the two sorted array contain a common element
@@ -131,82 +130,12 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
     count
 }
 
-/// Intersect twos sorted arrays `left` and `right` and outputs the
-/// resulting array in left.
-///
-/// Condition for match is that the value stored in left is less than the value in right,
-/// and that the difference between the value in left_begin is within
-///
-/// Returns the length of the intersection
-fn intersection_with_distance(
-    left: &mut [u32],
-    left_begin: &mut [u32],
-    right: &mut [u32],
-    max_distance_to_begin: u32,
-) -> usize {
-    // TODO: Improve variable names?
-    let mut left_i = 0;
-    let mut right_i = 0;
-    let mut count = 0;
-    let left_len = left.len();
-    let right_len = right.len();
-    // Is the current last value guaranteed to be the final value.
-    let mut is_temporary = false;
-    while left_i < left_len && right_i < right_len {
-        let left_val = left[left_i];
-        let right_val = right[right_i];
-        match left_val.cmp(&right_val) {
-            Ordering::Less => {
-                if right_val - left_begin[left_i] <= max_distance_to_begin {
-                    if is_temporary {
-                        // If the value was temporary we have found a closer match.
-                        count -= 1;
-                    };
-                    left[count] = left_val;
-                    left_begin[count] = left_begin[left_i];
-                    right[count] = right_val;
-                    count += 1;
-                    left_i += 1;
-                    // Still possible to find a closer match.
-                    is_temporary = true;
-                } else {
-                    left_i += 1;
-                }
-            }
-            Ordering::Equal => {
-                if is_temporary {
-                    // If the value was temporary we have found an.
-                    count -= 1;
-                    is_temporary = false;
-                }
-                left[count] = left_val;
-                left_begin[count] = left_begin[left_i];
-                right[count] = right_val;
-                count += 1;
-                left_i += 1;
-                right_i += 1;
-            }
-            Ordering::Greater => {
-                right_i += 1;
-                // Given the constraint that left cannot be greater than right we know that the value in left is
-                // final.
-                is_temporary = false;
-            }
-        }
-    }
-    for i in 0..count {
-        left[i] = right[i];
-    }
-    count
-}
-
 impl<TPostings: Postings> PhraseScorer<TPostings> {
     pub fn new(
         term_postings: Vec<(usize, TPostings)>,
         similarity_weight: Bm25Weight,
         fieldnorm_reader: FieldNormReader,
         scoring_enabled: bool,
-        slop: u32,
     ) -> PhraseScorer<TPostings> {
         let max_offset = term_postings
             .iter()
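The removed doc comment above trails off mid-sentence; the intent is that a position from `right` is kept when some phrase start recorded in `left_begin` lies at most `max_distance_to_begin` positions before it, and the `is_temporary` flag lets a closer candidate replace the previously kept one. A simplified, allocating sketch of the same logic (names are mine, not the PR's; the real function writes its results back into `left` in place):

```rust
use std::cmp::Ordering;

// Walk two sorted position lists; keep a `right` position when a phrase
// start in `left_begin` (aligned with `left`) is within the allowed
// distance. Assumes left_begin[i] <= left[i], as in the scorer.
fn intersection_within_distance(
    left: &[u32],
    left_begin: &[u32],
    right: &[u32],
    max_distance_to_begin: u32,
) -> Vec<(u32, u32)> {
    let (mut i, mut j) = (0, 0);
    let mut out: Vec<(u32, u32)> = Vec::new();
    // Whether the last kept entry may still be replaced by a closer match.
    let mut last_is_temporary = false;
    while i < left.len() && j < right.len() {
        match left[i].cmp(&right[j]) {
            Ordering::Less => {
                if right[j] - left_begin[i] <= max_distance_to_begin {
                    if last_is_temporary {
                        out.pop(); // a closer candidate supersedes it
                    }
                    out.push((right[j], left_begin[i]));
                    last_is_temporary = true;
                }
                i += 1;
            }
            Ordering::Equal => {
                if last_is_temporary {
                    out.pop();
                    last_is_temporary = false;
                }
                out.push((right[j], left_begin[i]));
                i += 1;
                j += 1;
            }
            Ordering::Greater => {
                // `left` can no longer match this `right` value: it is final.
                j += 1;
                last_is_temporary = false;
            }
        }
    }
    out
}

fn main() {
    // Mirrors the `test_merge_slop` case further down: matched positions
    // [6, 9, 12] are kept together with their phrase starts [2, 3, 5].
    let kept = intersection_within_distance(
        &[1, 5, 6, 9, 10, 12],
        &[0, 1, 2, 3, 4, 5],
        &[6, 8, 9, 12],
        10,
    );
    assert_eq!(kept, vec![(6, 2), (9, 3), (12, 5)]);
}
```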
@@ -229,7 +158,6 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
             similarity_weight,
             fieldnorm_reader,
             scoring_enabled,
-            slop,
         };
         if scorer.doc() != TERMINATED && !scorer.phrase_match() {
             scorer.advance();
@@ -242,7 +170,6 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
     }
 
     fn phrase_match(&mut self) -> bool {
-        // Need to add support for slop in phrase_count and phrase_exists.
         if self.scoring_enabled {
             let count = self.compute_phrase_count();
             self.phrase_count = count;
@@ -253,25 +180,29 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
     }
 
     fn phrase_exists(&mut self) -> bool {
-        let intersection_len = if self.has_slop() {
-            self.compute_match_with_slop()
-        } else {
-            self.compute_match()
-        };
+        self.intersection_docset
+            .docset_mut_specialized(0)
+            .positions(&mut self.left);
+        let mut intersection_len = self.left.len();
+        for i in 1..self.num_terms - 1 {
+            {
+                self.intersection_docset
+                    .docset_mut_specialized(i)
+                    .positions(&mut self.right);
+            }
+            intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
+            if intersection_len == 0 {
+                return false;
+            }
+        }
+
+        self.intersection_docset
+            .docset_mut_specialized(self.num_terms - 1)
+            .positions(&mut self.right);
         intersection_exists(&self.left[..intersection_len], &self.right[..])
     }
 
     fn compute_phrase_count(&mut self) -> u32 {
-        let intersection_len = if self.has_slop() {
-            self.compute_match_with_slop()
-        } else {
-            self.compute_match()
-        };
-        intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
-    }
-
-    /// Computes match without slop.
-    fn compute_match(&mut self) -> usize {
         {
             self.intersection_docset
                 .docset_mut_specialized(0)
@@ -286,52 +217,14 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
             }
             intersection_len = intersection(&mut self.left[..intersection_len], &self.right[..]);
             if intersection_len == 0 {
-                return 0;
+                return 0u32;
             }
         }
 
         self.intersection_docset
             .docset_mut_specialized(self.num_terms - 1)
             .positions(&mut self.right);
-        intersection_len
-    }
-
-    // Computes match with slop.
-    fn compute_match_with_slop(&mut self) -> usize {
-        {
-            self.intersection_docset
-                .docset_mut_specialized(0)
-                .positions(&mut self.left);
-        }
-        let mut intersection_len = self.left.len();
-        // We'll increment the values to be equal to the next match in the right array to achieve ordered slop.
-        let mut left_begin_vec = self.left.clone();
-        let left_begin = &mut left_begin_vec[..];
-        for i in 1..self.num_terms {
-            {
-                self.intersection_docset
-                    .docset_mut_specialized(i)
-                    .positions(&mut self.right);
-            }
-            intersection_len = intersection_with_distance(
-                &mut self.left[..intersection_len],
-                &mut left_begin[..intersection_len],
-                &mut self.right[..],
-                self.slop,
-            );
-            // Update the left to be equal to the right. Merge the initial left.
-            if intersection_len == 0 {
-                return 0;
-            }
-        }
-        self.intersection_docset
-            .docset_mut_specialized(self.num_terms - 1)
-            .positions(&mut self.right);
-        intersection_len
-    }
-
-    fn has_slop(&self) -> bool {
-        self.slop > 0
+        intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
     }
 }
 
@@ -374,28 +267,18 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
 
 #[cfg(test)]
 mod tests {
-    use super::{intersection, intersection_count, intersection_with_distance};
+    use super::{intersection, intersection_count};
 
     fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
-        test_intersection_aux(left, right, expected, 0);
-        test_intersection_aux(right, left, expected, 0);
+        test_intersection_aux(left, right, expected);
+        test_intersection_aux(right, left, expected);
     }
 
-    fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) {
+    fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32]) {
         let mut left_vec = Vec::from(left);
         let left_mut = &mut left_vec[..];
-        if slop == 0 {
-            let left_mut = &mut left_vec[..];
-            assert_eq!(intersection_count(left_mut, right), expected.len());
-            let count = intersection(left_mut, right);
-            assert_eq!(&left_mut[..count], expected);
-            return;
-        }
-        let mut right_vec = Vec::from(right);
-        let right_mut = &mut right_vec[..];
-        let mut left_begin_vec = Vec::from(left);
-        let left_begin_mut = &mut left_begin_vec[..];
-        let count = intersection_with_distance(left_mut, left_begin_mut, right_mut, slop);
+        assert_eq!(intersection_count(left_mut, right), expected.len());
+        let count = intersection(left_mut, right);
         assert_eq!(&left_mut[..count], expected);
     }
 
@@ -407,51 +290,6 @@ mod tests {
         test_intersection_sym(&[5, 7], &[1, 5, 10, 12], &[5]);
         test_intersection_sym(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12]);
     }
-    #[test]
-    fn test_slop() {
-        // The slop is not symetric. It does not allow for the phrase to be out of order.
-        test_intersection_aux(&[1], &[2], &[2], 1);
-        test_intersection_aux(&[1], &[3], &[], 1);
-        test_intersection_aux(&[1], &[3], &[3], 2);
-        test_intersection_aux(&[], &[2], &[], 100000);
-        test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1);
-        test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1);
-        test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
-    }
-
-    fn test_merge(
-        left: &[u32],
-        right: &[u32],
-        left_begin: &[u32],
-        expected_left: &[u32],
-        expected_left_begin: &[u32],
-        slop: u32,
-    ) {
-        let mut left_vec = Vec::from(left);
-        let left_mut = &mut left_vec[..];
-        let mut right_vec = Vec::from(right);
-        let right_mut = &mut right_vec[..];
-        let mut left_begin_vec = Vec::from(left_begin);
-        let left_begin_mut = &mut left_begin_vec[..];
-        let count = intersection_with_distance(left_mut, left_begin_mut, right_mut, slop);
-        assert_eq!(&left_mut[..count], expected_left);
-        assert_eq!(&left_begin_mut[..count], expected_left_begin);
-    }
-
-    #[test]
-    fn test_merge_slop() {
-        test_merge(&[1, 2], &[1], &[0, 1], &[1], &[0], 1);
-        test_merge(&[3], &[4], &[2], &[4], &[2], 2);
-        test_merge(&[3], &[4], &[2], &[4], &[2], 2);
-        test_merge(
-            &[1, 5, 6, 9, 10, 12],
-            &[6, 8, 9, 12],
-            &[0, 1, 2, 3, 4, 5],
-            &[6, 9, 12],
-            &[2, 3, 5],
-            10,
-        );
-    }
 }
 
 #[cfg(all(test, feature = "unstable"))]
@@ -16,7 +16,6 @@ pub struct PhraseWeight {
     phrase_terms: Vec<(usize, Term)>,
     similarity_weight: Bm25Weight,
    scoring_enabled: bool,
-    slop: u32,
 }
 
 impl PhraseWeight {
@@ -26,12 +25,10 @@ impl PhraseWeight {
         similarity_weight: Bm25Weight,
         scoring_enabled: bool,
     ) -> PhraseWeight {
-        let slop = 0;
         PhraseWeight {
             phrase_terms,
             similarity_weight,
             scoring_enabled,
-            slop,
         }
     }
 
@@ -80,13 +77,8 @@ impl PhraseWeight {
             similarity_weight,
             fieldnorm_reader,
             self.scoring_enabled,
-            self.slop,
         )))
     }
-
-    pub fn slop(&mut self, slop: u32) {
-        self.slop = slop;
-    }
 }
 
 impl Weight for PhraseWeight {