mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-09 02:22:54 +00:00
In sync with master. Fixed merging
This commit is contained in:
@@ -58,7 +58,7 @@ impl InvertedIndexReader {
|
||||
TermDictionaryImpl::empty(field_type),
|
||||
ReadOnlySource::empty(),
|
||||
ReadOnlySource::empty(),
|
||||
DeleteBitSet::empty(),
|
||||
None,
|
||||
record_option,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use core::SerializableSegment;
|
||||
use indexer::SegmentSerializer;
|
||||
use postings::InvertedIndexSerializer;
|
||||
use itertools::Itertools;
|
||||
use postings::Postings;
|
||||
use docset::DocSet;
|
||||
use fastfield::DeleteBitSet;
|
||||
use schema::{Field, Schema};
|
||||
@@ -18,6 +17,7 @@ use std::cmp::{max, min};
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermStreamer;
|
||||
use postings::DeleteSet;
|
||||
use postings::Postings;
|
||||
|
||||
pub struct IndexMerger {
|
||||
schema: Schema,
|
||||
@@ -206,6 +206,8 @@ impl IndexMerger {
|
||||
}
|
||||
|
||||
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
|
||||
|
||||
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
|
||||
let mut delta_computer = DeltaComputer::new();
|
||||
|
||||
let mut indexed_fields = vec![];
|
||||
@@ -314,15 +316,15 @@ impl IndexMerger {
|
||||
{
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
unreachable!();
|
||||
// let positions: &[u32] = segment_postings.positions();
|
||||
// let term_freq = segment_postings.term_freq();
|
||||
// let delta_positions = delta_computer.compute_delta(positions);
|
||||
// field_serializer.write_doc(
|
||||
// remapped_doc_id,
|
||||
// term_freq,
|
||||
// delta_positions,
|
||||
// )?;
|
||||
let term_freq = segment_postings.term_freq();
|
||||
segment_postings.positions(&mut positions_buffer);
|
||||
|
||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||
field_serializer.write_doc(
|
||||
remapped_doc_id,
|
||||
term_freq,
|
||||
delta_positions,
|
||||
)?;
|
||||
}
|
||||
if !segment_postings.advance() {
|
||||
break;
|
||||
|
||||
@@ -103,15 +103,18 @@ pub mod tests {
|
||||
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
|
||||
let term = Term::from_field_text(title, "abc");
|
||||
|
||||
let mut positions = Vec::new();
|
||||
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
postings.advance();
|
||||
assert_eq!(&[0, 1, 2], postings.positions());
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 1, 2], &positions[..]);
|
||||
postings.advance();
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
@@ -119,7 +122,8 @@ pub mod tests {
|
||||
.unwrap();
|
||||
postings.advance();
|
||||
postings.advance();
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
|
||||
@@ -128,7 +132,8 @@ pub mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(postings.skip_next(1), SkipResult::Reached);
|
||||
assert_eq!(postings.doc(), 1);
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
@@ -136,7 +141,8 @@ pub mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(postings.doc(), 1002);
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
@@ -145,12 +151,14 @@ pub mod tests {
|
||||
assert_eq!(postings.skip_next(100), SkipResult::Reached);
|
||||
assert_eq!(postings.skip_next(1002), SkipResult::Reached);
|
||||
assert_eq!(postings.doc(), 1002);
|
||||
assert_eq!(&[0, 5], postings.positions());
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm1() {
|
||||
let mut positions = Vec::new();
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
@@ -223,15 +231,16 @@ pub mod tests {
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.doc(), 0);
|
||||
assert_eq!(postings_a.term_freq(), 6);
|
||||
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
|
||||
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
|
||||
postings_a.positions(&mut positions);
|
||||
assert_eq!(&positions[..], [0, 2, 4, 6, 7, 13]);
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.doc(), 1u32);
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
for i in 2u32..1000u32 {
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
assert_eq!(postings_a.positions(), [i]);
|
||||
postings_a.positions(&mut positions);
|
||||
assert_eq!(&positions[..], [i]);
|
||||
assert_eq!(postings_a.doc(), i);
|
||||
}
|
||||
assert!(!postings_a.advance());
|
||||
@@ -246,7 +255,7 @@ pub mod tests {
|
||||
for i in 2u32..1000u32 {
|
||||
assert!(postings_e.advance());
|
||||
assert_eq!(postings_e.term_freq(), i);
|
||||
let positions = postings_e.positions();
|
||||
postings_e.positions(&mut positions);
|
||||
assert_eq!(positions.len(), i as usize);
|
||||
for j in 0..positions.len() {
|
||||
assert_eq!(positions[j], (j as u32));
|
||||
@@ -260,6 +269,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm2() {
|
||||
let mut positions: Vec<u32> = Vec::new();
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
@@ -288,7 +298,8 @@ pub mod tests {
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 1u32);
|
||||
assert_eq!(postings.positions(), &[1u32, 4]);
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&positions[..], &[1u32, 4]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -17,4 +17,8 @@ pub trait Postings: DocSet + 'static {
|
||||
/// Returns the list of positions of the term, expressed as a list of
|
||||
/// token ordinals.
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
|
||||
|
||||
fn positions(&mut self, output: &mut Vec<u32>) {
|
||||
self.positions_with_offset(0u32, output);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,13 +9,10 @@ use std::cmp;
|
||||
use fst::Streamer;
|
||||
use compression::compressed_block_size;
|
||||
use postings::{NoDelete, DeleteSet};
|
||||
use std::cell::UnsafeCell;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::FreqReadingOption;
|
||||
use postings::serializer::PostingsSerializer;
|
||||
|
||||
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
// before reading positions.
|
||||
@@ -41,8 +38,7 @@ impl PositionComputer {
|
||||
}
|
||||
|
||||
// Positions can only be read once.
|
||||
pub fn positions(&mut self, offset: u32, output: &mut [u32]) {
|
||||
let term_freq = output.len();
|
||||
pub fn positions_with_offset(&mut self, offset: u32, output: &mut [u32]) {
|
||||
if let Some(num_skip) = self.position_to_skip {
|
||||
self.positions_stream.skip(num_skip);
|
||||
self.positions_stream.read(output);
|
||||
@@ -183,7 +179,7 @@ impl<TDeleteSet: DeleteSet> DocSet for SegmentPostings<TDeleteSet> {
|
||||
// add the term freq.
|
||||
if self.position_computer.is_some() {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
|
||||
let sum_freq: u32 = freqs_skipped.iter().sum()
|
||||
let sum_freq: u32 = freqs_skipped.iter().sum();
|
||||
self.position_computer.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freq as usize);
|
||||
@@ -319,10 +315,10 @@ impl<TDeleteSet: DeleteSet> Postings for SegmentPostings<TDeleteSet> {
|
||||
}
|
||||
unsafe {
|
||||
output.set_len(term_freq);
|
||||
self.position_computer.as_mut().unwrap().positions(offset, &mut output[..])
|
||||
self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
|
||||
}
|
||||
} else {
|
||||
unimplemented!("You may not read positions twice!");
|
||||
output.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ use DocId;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::Postings;
|
||||
use query::{Intersection, Scorer};
|
||||
use std::mem;
|
||||
|
||||
|
||||
struct PostingsWithOffset<TPostings> {
|
||||
@@ -48,7 +47,7 @@ pub struct PhraseScorer<TPostings: Postings> {
|
||||
right: Vec<u32>
|
||||
}
|
||||
|
||||
fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
|
||||
fn intersection_count(left: &[u32], right: &[u32]) -> usize {
|
||||
let mut left_i = 0;
|
||||
let mut right_i = 0;
|
||||
let mut count = 0;
|
||||
@@ -58,7 +57,6 @@ fn intersection_arr(left: &mut [u32], right: &[u32]) -> usize {
|
||||
} else if right[right_i] < left[left_i] {
|
||||
right_i += 1;
|
||||
} else {
|
||||
left[count] = left[left_i];
|
||||
count+=1;
|
||||
left_i += 1;
|
||||
right_i += 1;
|
||||
@@ -95,7 +93,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
{
|
||||
self.intersection_docset.docset_mut_specialized(i).positions(&mut self.right);
|
||||
}
|
||||
intersection_len = intersection_arr(&mut self.left[..intersection_len], &self.right[..]);
|
||||
intersection_len = intersection_count(&mut self.left[..intersection_len], &self.right[..]);
|
||||
if intersection_len == 0 {
|
||||
return false;
|
||||
}
|
||||
@@ -152,25 +150,14 @@ mod tests {
|
||||
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
use super::{intersection_arr, intersection_avx};
|
||||
use super::intersection_count;
|
||||
|
||||
#[bench]
|
||||
fn bench_intersection(b: &mut Bencher) {
|
||||
let left = tests::sample_with_seed(100_000, 0.1, 1);
|
||||
let right = tests::sample_with_seed(200_000, 0.05, 2);
|
||||
let mut output = vec![0u32; 200_000];
|
||||
let left = tests::sample_with_seed(10, 0.1, 1);
|
||||
let right = tests::sample_with_seed(2, 0.05, 2);
|
||||
b.iter(|| {
|
||||
intersection_arr(&left, &right, &mut output);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intersection_avx(b: &mut Bencher) {
|
||||
let left = tests::sample_with_seed(100_000, 0.1, 1);
|
||||
let right = tests::sample_with_seed(200_000, 0.05, 2);
|
||||
let mut output = vec![0u32; 200_000];
|
||||
b.iter(|| {
|
||||
intersection_avx(&left, &right, &mut output);
|
||||
intersection_count(&left, &right);
|
||||
});
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user