From 654769bb60e4fd1ceeafeefeefb895cb560e42a9 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Sat, 13 Aug 2016 15:59:12 +0900
Subject: [PATCH] merging positions.

---
 src/core/merger.rs       | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 src/postings/postings.rs |  9 ++++++++-
 2 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/src/core/merger.rs b/src/core/merger.rs
index 7ba7c44d8..69bdc3243 100644
--- a/src/core/merger.rs
+++ b/src/core/merger.rs
@@ -18,6 +18,7 @@ use postings::ChainedPostings;
 use postings::HasLen;
 use postings::OffsetPostings;
 use core::index::SegmentInfo;
+use compression::NUM_DOCS_PER_BLOCK;
 use std::cmp::{min, max, Ordering};


@@ -121,14 +122,52 @@ impl<'a> PostingsMerger<'a> {
     }
 }

-const EMPTY_ARRAY: [u32; 0] = [];
-
 pub struct IndexMerger {
     schema: Schema,
     readers: Vec,
     segment_info: SegmentInfo,
 }

+/// Scratch buffer used while merging to turn the absolute positions of a
+/// term within one document into the delta-encoded form handed to the
+/// postings serializer.
+///
+/// The buffer is a growable `Vec`: nothing caps the length of the
+/// `positions` slice passed in, so a fixed `NUM_DOCS_PER_BLOCK`-sized
+/// array would panic (index out of bounds) as soon as a document holds
+/// more than `NUM_DOCS_PER_BLOCK` positions for the term.
+struct DeltaPositionComputer {
+    buffer: Vec<u32>,
+}
+
+impl DeltaPositionComputer {
+    /// Creates a computer whose buffer is pre-sized for the common case;
+    /// `NUM_DOCS_PER_BLOCK` is only an initial capacity, not a limit.
+    fn new() -> DeltaPositionComputer {
+        DeltaPositionComputer {
+            buffer: Vec::with_capacity(NUM_DOCS_PER_BLOCK),
+        }
+    }
+
+    /// Returns the gaps between consecutive `positions`; the first gap is
+    /// measured from 0. An empty input yields an empty slice.
+    ///
+    /// `positions` must be sorted in increasing order, otherwise the
+    /// `u32` subtraction underflows.
+    ///
+    /// The returned slice borrows the internal buffer and is overwritten
+    /// by the next call.
+    fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
+        self.buffer.clear();
+        let mut last_pos = 0u32;
+        for &position in positions {
+            self.buffer.push(position - last_pos);
+            last_pos = position;
+        }
+        &self.buffer[..]
+    }
+}
+
 impl IndexMerger {
     pub fn open(schema: Schema, segments: &Vec) -> Result {
         let mut readers = Vec::new();
@@ -205,12 +244,14 @@ impl IndexMerger {

     fn write_postings(&self, postings_serializer: &mut PostingsSerializer) -> Result<()> {
         let mut postings_merger = PostingsMerger::new(&self.readers);
+        let mut delta_position_computer = DeltaPositionComputer::new();
         loop {
             match postings_merger.next() {
                 Some((term, mut merged_doc_ids)) => {
                     try!(postings_serializer.new_term(&term, merged_doc_ids.len() as DocId));
                     while merged_doc_ids.advance() {
-                        try!(postings_serializer.write_doc(merged_doc_ids.doc(), merged_doc_ids.term_freq(), &EMPTY_ARRAY));
+                        let delta_positions: &[u32] = delta_position_computer.compute_delta_positions(merged_doc_ids.positions());
+                        try!(postings_serializer.write_doc(merged_doc_ids.doc(), merged_doc_ids.term_freq(), delta_positions));
                     }
                     try!(postings_serializer.close_term());
                 }
diff --git a/src/postings/postings.rs b/src/postings/postings.rs
index b85a15fe8..e18894d83 100644
--- a/src/postings/postings.rs
+++ b/src/postings/postings.rs
@@ -4,7 +4,14 @@ use postings::docset::DocSet;



-// Postings trait includes all the infomration
+// Postings trait defines all of the information
+// associated with a term.
+//
+// List of docids, term freqs and positions.
+//
+// Its main implementation is SegmentPostings,
+// but other implementations mocking SegmentPostings exist,
+// in order to help merging segments or for testing.
 pub trait Postings: DocSet {
     fn term_freq(&self,) -> u32;
     fn positions(&self) -> &[u32];