allow slop in both directions (#2020)

* allow slop in both directions

allow slop in both directions
so "big wolf"~3 can also match "wolf big"

This also fixes #1934, where the docsets were reordered by size and no longer
matched the terms.

* remove count

* add test for repeating tokens, deduplicate tests
This commit is contained in:
PSeitz
2023-05-07 11:05:21 +08:00
committed by GitHub
parent 85df322ceb
commit 4c58b0086d
2 changed files with 138 additions and 199 deletions

View File

@@ -160,22 +160,7 @@ pub mod tests {
#[test]
pub fn test_phrase_score() -> crate::Result<()> {
let index = create_index(&["a b c", "a b c a b"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::new(terms);
searcher
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
.expect("search should succeed")
.scores()
.to_vec()
};
let scores = test_query(vec!["a", "b"]);
let scores = test_query(0, &index, vec!["a", "b"]);
assert_nearly_equals!(scores[0], 0.40618482);
assert_nearly_equals!(scores[1], 0.46844664);
Ok(())
@@ -185,48 +170,48 @@ pub mod tests {
#[test]
pub fn test_phrase_score_with_slop() -> crate::Result<()> {
let index = create_index(&["a c b", "a b c a b"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let mut phrase_query = PhraseQuery::new(terms);
phrase_query.set_slop(1);
searcher
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
.expect("search should succeed")
.scores()
.to_vec()
};
let scores = test_query(vec!["a", "b"]);
let scores = test_query(1, &index, vec!["a", "b"]);
assert_nearly_equals!(scores[0], 0.40618482);
assert_nearly_equals!(scores[1], 0.46844664);
Ok(())
}
/// Runs the phrase `captain wendy` with a slop of 1 against a two-document
/// index and checks that exactly one score is returned.
///
/// NOTE(review): presumably the regression test for the docset-reordering
/// issue (#1934) mentioned in the commit message — confirm.
#[test]
pub fn test_phrase_score_with_slop_bug() -> crate::Result<()> {
    let index = create_index(&["asdf asdf Captain Subject Wendy", "Captain"])?;
    // Only the number of returned scores is checked, not their values.
    assert_eq!(test_query(1, &index, vec!["captain", "wendy"]).len(), 1);
    Ok(())
}
/// Shared helper for the phrase-query tests.
///
/// Builds a `PhraseQuery` over the `"text"` field from `texts` (one term per
/// entry, in the given order), applies `slop`, runs it against `index` with
/// `TEST_COLLECTOR_WITH_SCORE` and returns the collected scores.
///
/// # Panics
///
/// Panics if the schema has no `"text"` field, if the index reader cannot be
/// created, or if the search itself fails.
fn test_query(slop: u32, index: &Index, texts: Vec<&str>) -> Vec<f32> {
    let schema = index.schema();
    let text_field = schema.get_field("text").unwrap();
    let searcher = index.reader().unwrap().searcher();

    // One term per input token, preserving the order of the phrase.
    let mut terms: Vec<Term> = Vec::with_capacity(texts.len());
    for text in texts.iter() {
        terms.push(Term::from_field_text(text_field, text));
    }

    let mut phrase_query = PhraseQuery::new(terms);
    phrase_query.set_slop(slop);

    let collected = searcher
        .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
        .expect("search should succeed");
    collected.scores().to_vec()
}
/// Runs the three-term phrase `wendy subject captain` with a slop of 1
/// against an index whose first document repeats the middle token
/// (`subject subject`), and checks that exactly one score is returned.
#[test]
pub fn test_phrase_score_with_slop_repeating() -> crate::Result<()> {
    let index = create_index(&["wendy subject subject captain", "Captain"])?;
    // Only the number of returned scores is checked, not their values.
    assert_eq!(
        test_query(1, &index, vec!["wendy", "subject", "captain"]).len(),
        1
    );
    Ok(())
}
#[test]
pub fn test_phrase_score_with_slop_size() -> crate::Result<()> {
let index = create_index(&["a b e c", "a e e e c", "a e e e e c"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let mut phrase_query = PhraseQuery::new(terms);
phrase_query.set_slop(3);
searcher
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
.expect("search should succeed")
.scores()
.to_vec()
};
let scores = test_query(vec!["a", "c"]);
let scores = test_query(3, &index, vec!["a", "c"]);
assert_nearly_equals!(scores[0], 0.29086056);
assert_nearly_equals!(scores[1], 0.26706287);
Ok(())
@@ -237,31 +222,16 @@ pub mod tests {
let index = create_index(&[
"a e b e c",
"a e e e e e b e e e e c",
"a c b",
"a c b", // also matches
"a c e b e",
"a e c b",
"a e b c",
])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let mut phrase_query = PhraseQuery::new(terms);
phrase_query.set_slop(3);
searcher
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
.expect("search should succeed")
.scores()
.to_vec()
};
let scores = test_query(vec!["a", "b", "c"]);
let scores = test_query(3, &index, vec!["a", "b", "c"]);
// The first and last matches.
assert_nearly_equals!(scores[0], 0.23091172);
assert_nearly_equals!(scores[1], 0.25024384);
assert_nearly_equals!(scores[1], 0.27310878);
assert_nearly_equals!(scores[3], 0.25024384);
Ok(())
}

View File

@@ -46,8 +46,8 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
pub struct PhraseScorer<TPostings: Postings> {
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
num_terms: usize,
left: Vec<u32>,
right: Vec<u32>,
left_positions: Vec<u32>,
right_positions: Vec<u32>,
phrase_count: u32,
fieldnorm_reader: FieldNormReader,
similarity_weight_opt: Option<Bm25Weight>,
@@ -104,7 +104,8 @@ pub(crate) fn intersection_count(left: &[u32], right: &[u32]) -> usize {
/// resulting array in left.
///
/// Returns the length of the intersection
fn intersection(left: &mut [u32], right: &[u32]) -> usize {
#[inline]
fn intersection(left: &mut Vec<u32>, right: &[u32]) {
let mut left_index = 0;
let mut right_index = 0;
let mut count = 0;
@@ -128,42 +129,36 @@ fn intersection(left: &mut [u32], right: &[u32]) -> usize {
}
}
}
count
left.truncate(count);
}
/// Intersects two sorted arrays `left` and `right` and outputs the
/// resulting array in left.
/// resulting array in left_positions if update_left is true.
///
/// Condition for match is that the value stored in left is less than or equal to
/// the value in right and that the distance to the previous token is lte to the slop.
/// Condition for match is that the distance between left and right is less than or equal to `slop`.
///
/// Returns the length of the intersection
fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
#[inline]
fn intersection_count_with_slop(
left_positions: &mut Vec<u32>,
right_positions: &[u32],
slop: u32,
update_left: bool,
) -> usize {
let mut left_index = 0;
let mut right_index = 0;
let mut count = 0;
let left_len = left.len();
let right_len = right.len();
let left_len = left_positions.len();
let right_len = right_positions.len();
while left_index < left_len && right_index < right_len {
let left_val = left[left_index];
let right_val = right[right_index];
let left_val = left_positions[left_index];
let right_val = right_positions[right_index];
// The three conditions are:
// left_val < right_slop -> left index increment.
// right_slop <= left_val <= right -> find the best match.
// left_val > right -> right index increment.
let right_slop = if right_val >= slop {
right_val - slop
} else {
0
};
if left_val < right_slop {
left_index += 1;
} else if right_slop <= left_val && left_val <= right_val {
let distance = left_val.abs_diff(right_val);
if distance <= slop {
while left_index + 1 < left_len {
// there could be a better match
let next_left_val = left[left_index + 1];
let next_left_val = left_positions[left_index + 1];
if next_left_val > right_val {
// the next value is outside the range, so current one is the best.
break;
@@ -171,72 +166,45 @@ fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
// the next value is better.
left_index += 1;
}
// store the match in left.
left[count] = right_val;
count += 1;
left_index += 1;
right_index += 1;
} else if left_val > right_val {
right_index += 1;
}
}
count
}
fn intersection_count_with_slop(left: &[u32], right: &[u32], slop: u32) -> usize {
let mut left_index = 0;
let mut right_index = 0;
let mut count = 0;
let left_len = left.len();
let right_len = right.len();
while left_index < left_len && right_index < right_len {
let left_val = left[left_index];
let right_val = right[right_index];
let right_slop = if right_val >= slop {
right_val - slop
} else {
0
};
if left_val < right_slop {
left_index += 1;
} else if right_slop <= left_val && left_val <= right_val {
while left_index + 1 < left_len {
let next_left_val = left[left_index + 1];
if next_left_val > right_val {
break;
}
left_index += 1;
if update_left {
left_positions[count] = right_val;
}
count += 1;
left_index += 1;
right_index += 1;
} else if left_val > right_val {
} else if left_val < right_val {
left_index += 1;
} else {
right_index += 1;
}
}
if update_left {
left_positions.truncate(count);
}
count
}
fn intersection_exists_with_slop(left: &[u32], right: &[u32], slop: u32) -> bool {
fn intersection_exists_with_slop(
left_positions: &[u32],
right_positions: &[u32],
slop: u32,
) -> bool {
let mut left_index = 0;
let mut right_index = 0;
let left_len = left.len();
let right_len = right.len();
let left_len = left_positions.len();
let right_len = right_positions.len();
while left_index < left_len && right_index < right_len {
let left_val = left[left_index];
let right_val = right[right_index];
let right_slop = if right_val >= slop {
right_val - slop
} else {
0
};
if left_val < right_slop {
left_index += 1;
} else if right_slop <= left_val && left_val <= right_val {
let left_val = left_positions[left_index];
let right_val = right_positions[right_index];
let distance = left_val.abs_diff(right_val);
if distance <= slop {
return true;
} else if left_val > right_val {
} else if left_val < right_val {
left_index += 1;
} else {
right_index += 1;
}
}
@@ -261,20 +229,20 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
pub(crate) fn new_with_offset(
term_postings: Vec<(usize, TPostings)>,
term_postings_with_offset: Vec<(usize, TPostings)>,
similarity_weight_opt: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
slop: u32,
offset: usize,
) -> PhraseScorer<TPostings> {
let max_offset = term_postings
let max_offset = term_postings_with_offset
.iter()
.map(|&(offset, _)| offset)
.max()
.unwrap_or(0)
+ offset;
let num_docsets = term_postings.len();
let postings_with_offsets = term_postings
let num_docsets = term_postings_with_offset.len();
let postings_with_offsets = term_postings_with_offset
.into_iter()
.map(|(offset, postings)| {
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
@@ -283,8 +251,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
let mut scorer = PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_terms: num_docsets,
left: Vec::with_capacity(100),
right: Vec::with_capacity(100),
left_positions: Vec::with_capacity(100),
right_positions: Vec::with_capacity(100),
phrase_count: 0u32,
similarity_weight_opt,
fieldnorm_reader,
@@ -301,8 +269,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
pub(crate) fn get_intersection(&mut self) -> &[u32] {
let len = intersection(&mut self.left, &self.right);
&self.left[..len]
intersection(&mut self.left_positions, &self.right_positions);
&self.left_positions
}
fn phrase_match(&mut self) -> bool {
@@ -316,59 +284,61 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
fn phrase_exists(&mut self) -> bool {
let intersection_len = self.compute_phrase_match();
self.compute_phrase_match();
if self.has_slop() {
return intersection_exists_with_slop(
&self.left[..intersection_len],
&self.right[..],
intersection_exists_with_slop(
&self.left_positions,
&self.right_positions[..],
self.slop,
);
)
} else {
intersection_exists(&self.left_positions, &self.right_positions[..])
}
intersection_exists(&self.left[..intersection_len], &self.right[..])
}
fn compute_phrase_count(&mut self) -> u32 {
let intersection_len = self.compute_phrase_match();
self.compute_phrase_match();
if self.has_slop() {
return intersection_count_with_slop(
&self.left[..intersection_len],
&self.right[..],
intersection_count_with_slop(
&mut self.left_positions,
&self.right_positions[..],
self.slop,
) as u32;
false,
) as u32
} else {
intersection_count(&self.left_positions, &self.right_positions[..]) as u32
}
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
}
fn compute_phrase_match(&mut self) -> usize {
fn compute_phrase_match(&mut self) {
{
self.intersection_docset
.docset_mut_specialized(0)
.positions(&mut self.left);
.positions(&mut self.left_positions);
}
let mut intersection_len = self.left.len();
for i in 1..self.num_terms - 1 {
{
self.intersection_docset
.docset_mut_specialized(i)
.positions(&mut self.right);
.positions(&mut self.right_positions);
}
intersection_len = if self.has_slop() {
intersection_with_slop(
&mut self.left[..intersection_len],
&self.right[..],
if self.has_slop() {
intersection_count_with_slop(
&mut self.left_positions,
&self.right_positions[..],
self.slop,
)
true,
);
} else {
intersection(&mut self.left[..intersection_len], &self.right[..])
intersection(&mut self.left_positions, &self.right_positions);
};
if intersection_len == 0 {
return 0;
if self.left_positions.is_empty() {
return;
}
}
self.intersection_docset
.docset_mut_specialized(self.num_terms - 1)
.positions(&mut self.right);
intersection_len
.positions(&mut self.right_positions);
}
fn has_slop(&self) -> bool {
@@ -418,7 +388,7 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
#[cfg(test)]
mod tests {
use super::{intersection, intersection_count, intersection_with_slop};
use super::{intersection, intersection_count, intersection_count_with_slop};
fn test_intersection_sym(left: &[u32], right: &[u32], expected: &[u32]) {
test_intersection_aux(left, right, expected, 0);
@@ -427,18 +397,16 @@ mod tests {
fn test_intersection_aux(left: &[u32], right: &[u32], expected: &[u32], slop: u32) {
let mut left_vec = Vec::from(left);
let left_mut = &mut left_vec[..];
if slop == 0 {
let left_mut = &mut left_vec[..];
assert_eq!(intersection_count(left_mut, right), expected.len());
let count = intersection(left_mut, right);
assert_eq!(&left_mut[..count], expected);
return;
assert_eq!(intersection_count(&left_vec, right), expected.len());
intersection(&mut left_vec, right);
assert_eq!(&left_vec, expected);
} else {
let mut right_vec = Vec::from(right);
let right_mut = &mut right_vec[..];
intersection_count_with_slop(&mut left_vec, right_mut, slop, true);
assert_eq!(&left_vec, expected);
}
let mut right_vec = Vec::from(right);
let right_mut = &mut right_vec[..];
let count = intersection_with_slop(left_mut, right_mut, slop);
assert_eq!(&left_mut[..count], expected);
}
#[test]
@@ -456,20 +424,19 @@ mod tests {
test_intersection_aux(&[1], &[3], &[], 1);
test_intersection_aux(&[1], &[3], &[3], 2);
test_intersection_aux(&[], &[2], &[], 100000);
test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 12], 1);
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 1);
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
test_intersection_aux(&[5, 7, 11], &[1, 5, 10, 12], &[5, 10], 1);
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 8, 9, 12], 1);
test_intersection_aux(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 8, 9, 12], 10);
test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[2, 4, 6], 1);
test_intersection_aux(&[1, 3, 5], &[2, 4, 6], &[], 0);
}
fn test_merge(left: &[u32], right: &[u32], expected_left: &[u32], slop: u32) {
let mut left_vec = Vec::from(left);
let left_mut = &mut left_vec[..];
let mut right_vec = Vec::from(right);
let right_mut = &mut right_vec[..];
let count = intersection_with_slop(left_mut, right_mut, slop);
assert_eq!(&left_mut[..count], expected_left);
intersection_count_with_slop(&mut left_vec, right_mut, slop, true);
assert_eq!(&left_vec, expected_left);
}
#[test]
@@ -477,7 +444,7 @@ mod tests {
test_merge(&[1, 2], &[1], &[1], 1);
test_merge(&[3], &[4], &[4], 2);
test_merge(&[3], &[4], &[4], 2);
test_merge(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 9, 12], 10);
test_merge(&[1, 5, 6, 9, 10, 12], &[6, 8, 9, 12], &[6, 8, 9, 12], 10);
}
}
@@ -490,8 +457,10 @@ mod bench {
#[bench]
fn bench_intersection_short(b: &mut Bencher) {
let mut left = Vec::new();
b.iter(|| {
let mut left = [1, 5, 10, 12];
left.clear();
left.extend_from_slice(&[1, 5, 10, 12]);
let right = [5, 7];
intersection(&mut left, &right);
});