From 236fa74767c8dd0feb3bb2a4d01c346eb6dafdbb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 5 Aug 2017 23:17:35 +0900 Subject: [PATCH] Positions almost working. --- src/core/segment_reader.rs | 5 +- src/postings/mod.rs | 13 ++-- src/postings/segment_postings.rs | 101 +++++++++++++++++++++++++--- src/query/term_query/term_weight.rs | 6 +- 4 files changed, 103 insertions(+), 22 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index ff66273e2..56247e942 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -24,7 +24,6 @@ use postings::SegmentPostingsOption; use postings::{SegmentPostings, BlockSegmentPostings}; use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader}; use schema::Schema; -use postings::FreqHandler; @@ -198,10 +197,10 @@ impl SegmentReader { /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. - pub fn read_postings(&self, + pub fn read_postings<'a>(&'a self, term: &Term, option: SegmentPostingsOption) - -> Option { + -> Option> { let field = term.field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(term)); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index ed8a6998f..21cfa6777 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -12,7 +12,6 @@ mod term_info; mod vec_postings; mod segment_postings; mod intersection; -mod freq_handler; mod docset; mod segment_postings_option; @@ -28,7 +27,6 @@ pub use self::vec_postings::VecPostings; pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings}; pub use self::intersection::IntersectionDocSet; -pub use self::freq_handler::FreqHandler; pub use self::segment_postings_option::SegmentPostingsOption; pub use common::HasLen; @@ -63,18 +61,18 @@ mod tests { let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap(); posting_serializer.new_field(text_field); posting_serializer.new_term("abc".as_bytes()).unwrap(); - for doc_id in 0u32..3u32 { - let positions = vec![1, 2, 3, 2]; - posting_serializer.write_doc(doc_id, 2, &positions).unwrap(); + for doc_id in 0u32..120u32 { + let delta_positions = vec![1, 2, 3, 2]; + posting_serializer.write_doc(doc_id, 2, &delta_positions).unwrap(); } posting_serializer.close_term().unwrap(); posting_serializer.close().unwrap(); let read = segment.open_read(SegmentComponent::POSITIONS).unwrap(); - assert!(read.len() <= 16); + assert!(read.len() <= 140); } #[test] - pub fn test_position_and_fieldnorm() { + pub fn test_position_and_fieldnorm1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); @@ -144,6 +142,7 @@ mod tests { assert_eq!(postings_a.doc(), 0); assert_eq!(postings_a.term_freq(), 6); assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); + assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]); assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 1u32); assert_eq!(postings_a.term_freq(), 1); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 553c50f70..ab4805d5e 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -4,9 +4,65 @@ use postings::{Postings, DocSet, HasLen, SkipResult}; use std::cmp; use fst::Streamer; use fastfield::DeleteBitSet; - +use std::cell::UnsafeCell; const EMPTY_DATA: [u8; 0] = [0u8; 0]; +const EMPTY_POSITIONS: [u32; 0] = [0u32; 0]; + +struct PositionComputer<'a> { + // store the amount of position int + // before reading positions. + // + // if none, position are already loaded in + // the positions vec. + position_to_skip: Option, + + delta_positions: Vec, + positions: Vec, + positions_stream: CompressedIntStream<'a>, +} + +impl<'a> PositionComputer<'a> { + + pub fn new(positions_stream: CompressedIntStream<'a>) -> PositionComputer<'a> { + PositionComputer { + position_to_skip: None, + positions: vec!(), + delta_positions: vec!(), + positions_stream: positions_stream, + } + } + + pub fn add_skip(&mut self, num_skip: usize) { + self.position_to_skip = Some( + self.position_to_skip + .map(|prev_skip| prev_skip + num_skip) + .unwrap_or(0) + ); + } + + pub fn positions(&mut self, term_freq: usize) -> &[u32] { + self.delta_positions(term_freq); + &self.positions[..term_freq] + } + + pub fn delta_positions(&mut self, term_freq: usize) -> &[u32] { + if let Some(num_skip) = self.position_to_skip { + self.delta_positions.resize(term_freq, 0u32); + self.positions_stream.skip(num_skip); + self.positions_stream.read(&mut self.delta_positions[..term_freq]); + self.positions.resize(term_freq, 0u32); + let mut cum = 0u32; + for i in 0..term_freq as usize { + cum += self.delta_positions[i]; + self.positions[i] = cum; + } + self.position_to_skip = None; + } + &self.delta_positions[..term_freq] + } +} + /// `SegmentPostings` represents the inverted list or postings associated to @@ -18,9 +74,11 @@ pub struct SegmentPostings<'a> { block_cursor: BlockSegmentPostings<'a>, cur: usize, delete_bitset: DeleteBitSet, - positions_stream: Option>, + + position_computer: Option>>, } + impl<'a> SegmentPostings<'a> { /// Reads a Segment postings from an &[u8] /// @@ -30,24 +88,27 @@ impl<'a> SegmentPostings<'a> { /// frequencies and/or positions pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>, delete_bitset: DeleteBitSet, - positions_stream: Option>) + positions_stream_opt: Option>) -> SegmentPostings<'a> { + let position_computer = positions_stream_opt.map(|stream| { + UnsafeCell::new(PositionComputer::new(stream)) + }); SegmentPostings { block_cursor: segment_block_postings, cur: NUM_DOCS_PER_BLOCK, // cursor within the block delete_bitset: delete_bitset, - positions_stream: positions_stream + position_computer: position_computer, } } /// Returns an empty segment postings object - pub fn empty() -> SegmentPostings<'static> { + pub fn empty() -> SegmentPostings<'a> { let empty_block_cursor = BlockSegmentPostings::empty(); SegmentPostings { block_cursor: empty_block_cursor, delete_bitset: DeleteBitSet::empty(), cur: NUM_DOCS_PER_BLOCK, - positions_stream: None, + position_computer: None, } } } @@ -58,7 +119,9 @@ impl<'a> DocSet for SegmentPostings<'a> { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> bool { + let mut pos_to_skip = 0u32; loop { + pos_to_skip += self.term_freq(); self.cur += 1; if self.cur >= self.block_cursor.block_len() { self.cur = 0; @@ -68,6 +131,11 @@ impl<'a> DocSet for SegmentPostings<'a> { } } if !self.delete_bitset.is_deleted(self.doc()) { + if let Some(ref mut position_computer) = self.position_computer.as_mut() { + unsafe { + (*position_computer.get()).add_skip(pos_to_skip as usize); + } + } return true; } } @@ -181,11 +249,26 @@ impl<'a> Postings for SegmentPostings<'a> { } fn positions(&self) -> &[u32] { - unimplemented!(); + let term_freq = self.term_freq(); + let position_computer_ptr: *mut PositionComputer = self.position_computer + .as_ref() + .expect("Segment reader does not have positions.") + .get(); + unsafe { + (&mut *position_computer_ptr).positions(term_freq as usize) + } } fn delta_positions(&self) -> &[u32] { - unimplemented!(); + let term_freq = self.term_freq(); + self.position_computer + .as_ref() + .map(|position_computer| { + unsafe { + (&mut *position_computer.get()).delta_positions(term_freq as usize) + } + }) + .unwrap_or(&EMPTY_POSITIONS[..]) } } @@ -333,7 +416,7 @@ impl<'a> BlockSegmentPostings<'a> { num_vint_docs: 0, doc_decoder: BlockDecoder::new(), - freq_decoder: BlockDecoder::new(), + freq_decoder: BlockDecoder::with_val(1), has_freq: false, remaining_data: &EMPTY_DATA, diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index e781ebdbd..99bfa7d47 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -27,13 +27,13 @@ impl TermWeight { 1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln() } - pub fn specialized_scorer<'a>(&'a self, + pub fn specialized_scorer<'a>(&self, reader: &'a SegmentReader) -> Result>> { let field = self.term.field(); let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - Ok(reader - .read_postings(&self.term, self.segment_postings_options) + let postings: Option> = reader.read_postings(&self.term, self.segment_postings_options); + Ok(postings .map(|segment_postings| { TermScorer { idf: self.idf(),