mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-14 07:10:42 +00:00
A bug was added with the `seek_into_the_danger_zone()` optimization
(Spotted and fixed by Stu)
The contract says `seek_into_the_danger_zone()` returns true if the target doc is part of the docset.
The blanket implementation goes like this.
```
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
self.doc() == target
```
So it will return true if the target is TERMINATED, even though TERMINATED does not belong to the docset.
The fix tries to clarify the contracts and fixes the intersection algorithm.
We observe a small but across-the-board improvement in intersection performance.
---------
Co-authored-by: Stu Hood <stuhood@gmail.com>
Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>
91 lines
2.6 KiB
Rust
91 lines
2.6 KiB
Rust
#![allow(dead_code)]
|
|
|
|
use common::HasLen;
|
|
|
|
use crate::docset::{DocSet, TERMINATED};
|
|
use crate::DocId;
|
|
|
|
/// Simulate a `Postings` objects from a `VecPostings`.
|
|
/// `VecPostings` only exist for testing purposes.
|
|
///
|
|
/// Term frequencies always return 1.
|
|
/// No positions are returned.
|
|
pub struct VecDocSet {
|
|
doc_ids: Vec<DocId>,
|
|
cursor: usize,
|
|
}
|
|
|
|
impl From<Vec<DocId>> for VecDocSet {
    /// Wraps `doc_ids` into a `VecDocSet` positioned on its first element.
    ///
    /// # Panics
    ///
    /// Panics if `doc_ids` is not strictly increasing.
    fn from(doc_ids: Vec<DocId>) -> Self {
        // `slice::is_sorted` would accept duplicates; every doc id has to be
        // strictly greater than its predecessor, hence the pairwise check.
        assert!(doc_ids.windows(2).all(|w| w[0] < w[1]));
        Self { cursor: 0, doc_ids }
    }
}
|
|
|
|
impl DocSet for VecDocSet {
|
|
fn advance(&mut self) -> DocId {
|
|
self.cursor += 1;
|
|
if self.cursor >= self.doc_ids.len() {
|
|
self.cursor = self.doc_ids.len();
|
|
return TERMINATED;
|
|
}
|
|
self.doc()
|
|
}
|
|
|
|
fn doc(&self) -> DocId {
|
|
if self.cursor == self.doc_ids.len() {
|
|
return TERMINATED;
|
|
}
|
|
self.doc_ids[self.cursor]
|
|
}
|
|
|
|
fn size_hint(&self) -> u32 {
|
|
self.len() as u32
|
|
}
|
|
}
|
|
|
|
impl HasLen for VecDocSet {
    /// Returns the total number of doc ids in the underlying vector.
    ///
    /// The value does not depend on the current cursor position.
    fn len(&self) -> usize {
        let Self { doc_ids, .. } = self;
        doc_ids.len()
    }
}
|
|
|
|
#[cfg(test)]
pub(crate) mod tests {

    use super::*;
    use crate::docset::COLLECT_BLOCK_BUFFER_LEN;

    /// Exercises `doc`, `advance` and `seek` over the multiples of 3 below 3072.
    #[test]
    pub fn test_vec_postings() {
        let multiples_of_three: Vec<DocId> = (0u32..1024u32).map(|idx| idx * 3).collect();
        let mut postings = VecDocSet::from(multiples_of_three);

        // The docset starts positioned on its first doc.
        assert_eq!(postings.doc(), 0u32);

        assert_eq!(postings.advance(), 3u32);
        assert_eq!(postings.doc(), 3u32);

        // Seeking to a missing doc id lands on the next one present.
        assert_eq!(postings.seek(14u32), 15u32);
        assert_eq!(postings.doc(), 15u32);

        // Seeking to a doc id that is present lands exactly on it.
        assert_eq!(postings.seek(300u32), 300u32);
        assert_eq!(postings.doc(), 300u32);

        // Seeking past the last doc id exhausts the docset.
        assert_eq!(postings.seek(6000u32), TERMINATED);
    }

    /// Checks that `fill_buffer` returns two full blocks followed by a
    /// partial block holding the 9 remaining docs.
    #[test]
    pub fn test_fill_buffer() {
        let block_len = COLLECT_BLOCK_BUFFER_LEN as u32;
        let last_doc = block_len * 2 + 9;
        let doc_ids: Vec<DocId> = (1u32..=last_doc).collect();
        let mut postings = VecDocSet::from(doc_ids);
        let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN];

        // First block: docs 1..=block_len.
        let num_filled = postings.fill_buffer(&mut buffer);
        assert_eq!(num_filled, COLLECT_BLOCK_BUFFER_LEN);
        for (&actual, expected) in buffer.iter().zip(1u32..) {
            assert_eq!(actual, expected);
        }

        // Second block: docs block_len + 1..=2 * block_len.
        let num_filled = postings.fill_buffer(&mut buffer);
        assert_eq!(num_filled, COLLECT_BLOCK_BUFFER_LEN);
        for (&actual, expected) in buffer.iter().zip((block_len + 1)..) {
            assert_eq!(actual, expected);
        }

        // Only the 9 trailing docs are left for the last call.
        assert_eq!(postings.fill_buffer(&mut buffer), 9);
    }
}
|