mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 01:52:54 +00:00
Positions writes to an external Vec
This commit is contained in:
@@ -314,14 +314,15 @@ impl IndexMerger {
|
||||
{
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
let positions: &[u32] = segment_postings.positions();
|
||||
let term_freq = segment_postings.term_freq();
|
||||
let delta_positions = delta_computer.compute_delta(positions);
|
||||
field_serializer.write_doc(
|
||||
remapped_doc_id,
|
||||
term_freq,
|
||||
delta_positions,
|
||||
)?;
|
||||
unreachable!();
|
||||
// let positions: &[u32] = segment_postings.positions();
|
||||
// let term_freq = segment_postings.term_freq();
|
||||
// let delta_positions = delta_computer.compute_delta(positions);
|
||||
// field_serializer.write_doc(
|
||||
// remapped_doc_id,
|
||||
// term_freq,
|
||||
// delta_positions,
|
||||
// )?;
|
||||
}
|
||||
if !segment_postings.advance() {
|
||||
break;
|
||||
|
||||
@@ -16,9 +16,5 @@ pub trait Postings: DocSet + 'static {
|
||||
|
||||
/// Returns the list of positions of the term, expressed as a list of
|
||||
/// token ordinals.
|
||||
fn positions_with_offset(&self, offset: u32) -> &[u32];
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.positions_with_offset(0u32)
|
||||
}
|
||||
fn positions_with_offset(&self, offset: u32, output: &mut Vec<u32>);
|
||||
}
|
||||
|
||||
@@ -23,7 +23,6 @@ struct PositionComputer {
|
||||
// if none, positions are already loaded in
|
||||
// the positions vec.
|
||||
position_to_skip: Option<usize>,
|
||||
positions: Vec<u32>,
|
||||
positions_stream: CompressedIntStream,
|
||||
}
|
||||
|
||||
@@ -31,7 +30,6 @@ impl PositionComputer {
|
||||
pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
|
||||
PositionComputer {
|
||||
position_to_skip: None,
|
||||
positions: vec![],
|
||||
positions_stream,
|
||||
}
|
||||
}
|
||||
@@ -44,26 +42,19 @@ impl PositionComputer {
|
||||
);
|
||||
}
|
||||
|
||||
pub fn positions(&mut self, term_freq: usize, offset: u32) -> &[u32] {
|
||||
pub fn positions(&mut self, offset: u32, output: &mut [u32]) {
|
||||
let term_freq = output.len();
|
||||
if let Some(num_skip) = self.position_to_skip {
|
||||
let capacity = self.positions.capacity();
|
||||
if capacity < term_freq {
|
||||
let extra_capacity = term_freq - self.positions.len();
|
||||
self.positions.reserve(extra_capacity);
|
||||
}
|
||||
unsafe {self.positions.set_len(term_freq)};
|
||||
self.positions_stream.skip(num_skip);
|
||||
let positions_buf = &mut self.positions[..term_freq];
|
||||
self.positions_stream.read(positions_buf);
|
||||
self.positions_stream.read(output);
|
||||
self.position_to_skip = None;
|
||||
let mut cum = offset;
|
||||
for position_mut in positions_buf.iter_mut() {
|
||||
cum += *position_mut;
|
||||
*position_mut = cum;
|
||||
for output_mut in output.iter_mut() {
|
||||
cum += *output_mut;
|
||||
*output_mut = cum;
|
||||
}
|
||||
positions_buf
|
||||
} else {
|
||||
&self.positions[..term_freq]
|
||||
panic!("Failed positions");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -321,14 +312,21 @@ impl<TDeleteSet: DeleteSet> Postings for SegmentPostings<TDeleteSet> {
|
||||
self.block_cursor.freq(self.cur)
|
||||
}
|
||||
|
||||
fn positions_with_offset(&self, offset: u32) -> &[u32] {
|
||||
let term_freq = self.term_freq();
|
||||
self.position_computer
|
||||
.as_ref()
|
||||
.map(|position_computer| unsafe {
|
||||
(&mut *position_computer.get()).positions(term_freq as usize, offset)
|
||||
})
|
||||
.unwrap_or(&EMPTY_POSITIONS[..])
|
||||
fn positions_with_offset(&self, offset: u32, output: &mut Vec<u32>) {
|
||||
if let Some(ref position_computer) = self.position_computer.as_ref() {
|
||||
let prev_capacity = output.capacity();
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if term_freq > prev_capacity {
|
||||
let additional_len = term_freq - output.len();
|
||||
output.reserve(additional_len);
|
||||
}
|
||||
unsafe {
|
||||
output.set_len(term_freq);
|
||||
(&mut *position_computer.get()).positions(offset, &mut output[..])
|
||||
}
|
||||
} else {
|
||||
unimplemented!("You may not read positions twice!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,8 +18,8 @@ impl<TPostings: Postings> PostingsWithOffset<TPostings> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn positions(&self) -> &[u32] {
|
||||
self.postings.positions_with_offset(self.offset)
|
||||
pub fn positions(&self, output: &mut Vec<u32>) {
|
||||
self.postings.positions_with_offset(self.offset, output)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,11 +44,11 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
|
||||
pub struct PhraseScorer<TPostings: Postings> {
|
||||
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
|
||||
num_docsets: usize,
|
||||
source: Vec<u32>,
|
||||
result: Vec<u32>
|
||||
left: Vec<u32>,
|
||||
right: Vec<u32>
|
||||
}
|
||||
|
||||
fn intersection_arr(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
|
||||
fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
|
||||
let mut left_i = 0;
|
||||
let mut right_i = 0;
|
||||
let mut count = 0;
|
||||
@@ -58,7 +58,7 @@ fn intersection_arr(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
|
||||
} else if right[right_i] < left[left_i] {
|
||||
right_i += 1;
|
||||
} else {
|
||||
output[count] = left[left_i];
|
||||
left[count] = left[left_i];
|
||||
count+=1;
|
||||
left_i += 1;
|
||||
right_i += 1;
|
||||
@@ -67,6 +67,7 @@ fn intersection_arr(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
|
||||
count
|
||||
}
|
||||
|
||||
|
||||
impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
|
||||
pub fn new(term_postings: Vec<TPostings>) -> PhraseScorer<TPostings> {
|
||||
@@ -79,34 +80,18 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
PhraseScorer {
|
||||
intersection_docset: Intersection::new(postings_with_offsets),
|
||||
num_docsets,
|
||||
source: Vec::with_capacity(100),
|
||||
result: Vec::with_capacity(100)
|
||||
left: Vec::with_capacity(100),
|
||||
right: Vec::with_capacity(100)
|
||||
}
|
||||
}
|
||||
|
||||
fn phrase_match(&mut self) -> bool {
|
||||
// TODO early exit when we don't care about the phrase frequency
|
||||
let mut intersection_len;
|
||||
{
|
||||
let left = self.intersection_docset.docset(0).positions();
|
||||
let right = self.intersection_docset.docset(1).positions();
|
||||
let max_intersection_len = left.len().min(right.len());
|
||||
if max_intersection_len > self.result.len() {
|
||||
self.result.resize(max_intersection_len, 0u32);
|
||||
self.source.resize(max_intersection_len, 0u32)
|
||||
}
|
||||
intersection_len = intersection_arr(left, right, &mut self.result[..]);
|
||||
}
|
||||
if intersection_len == 0 {
|
||||
return false;
|
||||
}
|
||||
for i in 2..self.num_docsets {
|
||||
mem::swap(&mut self.source, &mut self.result);
|
||||
let term_positions = self.intersection_docset.docset(i).positions();
|
||||
intersection_len = intersection_arr(
|
||||
&self.source[..intersection_len],
|
||||
term_positions,
|
||||
&mut self.result[..]);
|
||||
self.intersection_docset.docset(0).positions(&mut self.left);
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_docsets {
|
||||
self.intersection_docset.docset(i).positions(&mut self.right);
|
||||
intersection_len = intersection_arr(&mut self.left[..intersection_len], &self.right[..]);
|
||||
if intersection_len == 0 {
|
||||
return false;
|
||||
}
|
||||
@@ -157,3 +142,31 @@ impl<TPostings: Postings> Scorer for PhraseScorer<TPostings> {
|
||||
1f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
use super::{intersection_arr, intersection_avx};
|
||||
|
||||
#[bench]
|
||||
fn bench_intersection(b: &mut Bencher) {
|
||||
let left = tests::sample_with_seed(100_000, 0.1, 1);
|
||||
let right = tests::sample_with_seed(200_000, 0.05, 2);
|
||||
let mut output = vec![0u32; 200_000];
|
||||
b.iter(|| {
|
||||
intersection_arr(&left, &right, &mut output);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intersection_avx(b: &mut Bencher) {
|
||||
let left = tests::sample_with_seed(100_000, 0.1, 1);
|
||||
let right = tests::sample_with_seed(200_000, 0.05, 2);
|
||||
let mut output = vec![0u32; 200_000];
|
||||
b.iter(|| {
|
||||
intersection_avx(&left, &right, &mut output);
|
||||
});
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user