From c3e3715cbd947e5419dff64b3fa316d263cea5eb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 13 Aug 2016 14:15:28 +0900 Subject: [PATCH] Can request for more or less functionality when opening a segment postings. --- src/core/merger.rs | 2 +- src/core/segment_reader.rs | 58 +++++++++++++++++++------ src/directory/mmap_directory.rs | 2 +- src/postings/freq_handler.rs | 28 +++++------- src/postings/mod.rs | 3 +- src/postings/segment_postings_option.rs | 5 +++ src/query/multi_term_query.rs | 3 +- 7 files changed, 67 insertions(+), 34 deletions(-) create mode 100644 src/postings/segment_postings_option.rs diff --git a/src/core/merger.rs b/src/core/merger.rs index acec73edd..7ba7c44d8 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -92,7 +92,7 @@ impl<'a> PostingsMerger<'a> { let offset = self.doc_offsets[heap_item.segment_ord]; let reader = &self.readers[heap_item.segment_ord]; - let segment_postings = reader.read_postings(&heap_item.term).unwrap(); + let segment_postings = reader.read_postings_all_info(&heap_item.term).unwrap(); let offset_postings = OffsetPostings::new(segment_postings, offset); segment_postings_list.push(offset_postings); } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index f31f8e70d..68494f3ea 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -15,6 +15,7 @@ use std::fmt; use rustc_serialize::json; use core::index::SegmentInfo; use schema::Field; +use postings::SegmentPostingsOption; use postings::SegmentPostings; use fastfield::{U32FastFieldsReader, U32FastFieldReader}; use schema::FieldEntry; @@ -104,7 +105,9 @@ impl SegmentReader { let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS)); let fieldnorms_reader = try!(U32FastFieldsReader::open(fieldnorms_data)); - let positions_data = try!(segment.open_read(SegmentComponent::POSITIONS)); + let positions_data = segment + .open_read(SegmentComponent::POSITIONS) + .unwrap_or(ReadOnlySource::Anonymous(Vec::new())); let schema = segment.schema(); Ok(SegmentReader { @@ -132,7 +135,11 @@ impl SegmentReader { self.store_reader.get(doc_id) } - pub fn read_postings(&self, term: &Term) -> Option { + + // TODO None is quite ambiguous here. + // is it because the term is not here, or because the + // field does not handle this functionality. + pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option { let field = term.get_field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(&term)); @@ -141,26 +148,51 @@ impl SegmentReader { let freq_handler = match field_entry { &FieldEntry::Text(_, ref options) => { let indexing_options = options.get_indexing_options(); - match indexing_options { - TextIndexingOptions::TokenizedWithFreq => { - FreqHandler::new_with_freq() - } - TextIndexingOptions::TokenizedWithFreqAndPosition => { - let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..]; - FreqHandler::new_with_freq_and_position(offseted_position_data) - } - _ => { + match option { + SegmentPostingsOption::NoFreq => { FreqHandler::new() } + SegmentPostingsOption::Freq => { + if indexing_options.is_termfreq_enabled() { + FreqHandler::new_with_freq() + } + else { + FreqHandler::new() + } + } + SegmentPostingsOption::FreqAndPositions => { + if indexing_options == TextIndexingOptions::TokenizedWithFreqAndPosition { + let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..]; + FreqHandler::new_with_freq_and_position(offseted_position_data) + } + else { + FreqHandler::new_with_freq() + } + } } } _ => { - panic!("Expected text field, got {:?}", field_entry); + FreqHandler::new() } }; Some(SegmentPostings::from_data(term_info.doc_freq, &postings_data, freq_handler)) } - + + pub fn read_postings_all_info(&self, term: &Term) -> Option { + let field_entry = self.schema.get_field_entry(term.get_field()); + let segment_posting_option = match field_entry { + &FieldEntry::Text(_, ref text_options) => { + match text_options.get_indexing_options() { + TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq, + TextIndexingOptions::TokenizedWithFreqAndPosition => SegmentPostingsOption::FreqAndPositions, + _ => SegmentPostingsOption::NoFreq, + } + } + &FieldEntry::U32(_, _) => SegmentPostingsOption::NoFreq + }; + self.read_postings(term, segment_posting_option) + } + pub fn get_term_info<'a>(&'a self, term: &Term) -> Option { self.term_infos.get(term.as_slice()) } diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 373b437a0..ce427082c 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -75,7 +75,7 @@ impl Directory for MmapDirectory { let new_mmap = try!( MmapReadOnly::open_path(full_path.clone()) .map_err(|err| { - if err.kind() == io::ErrorKind::AlreadyExists { + if err.kind() == io::ErrorKind::NotFound { OpenError::FileDoesNotExist(PathBuf::from(&full_path)) } else { diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 7832d99b9..dc80031a9 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -3,19 +3,13 @@ use std::io::Cursor; use common::VInt; use common::BinarySerializable; use compression::CompositeDecoder; +use postings::SegmentPostingsOption; use compression::NUM_DOCS_PER_BLOCK; - -enum Option { - NoFreq, - Freq, - FreqAndPositions, -} - pub struct FreqHandler { freq_decoder: SIMDBlockDecoder, positions: Vec, - option: Option, + option: SegmentPostingsOption, positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1], } @@ -38,7 +32,7 @@ impl FreqHandler { FreqHandler { freq_decoder: SIMDBlockDecoder::with_val(1u32), positions: Vec::new(), - option: Option::NoFreq, + option: SegmentPostingsOption::NoFreq, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } @@ -47,7 +41,7 @@ impl FreqHandler { FreqHandler { freq_decoder: SIMDBlockDecoder::new(), positions: Vec::new(), - option: Option::Freq, + option: SegmentPostingsOption::Freq, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } @@ -57,7 +51,7 @@ impl FreqHandler { FreqHandler { freq_decoder: SIMDBlockDecoder::new(), positions: positions, - option: Option::FreqAndPositions, + option: SegmentPostingsOption::FreqAndPositions, positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1], } } @@ -88,13 +82,13 @@ impl FreqHandler { pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { match self.option { - Option::NoFreq => { + SegmentPostingsOption::NoFreq => { data } - Option::Freq => { + SegmentPostingsOption::Freq => { self.freq_decoder.uncompress_block_unsorted(data) } - Option::FreqAndPositions => { + SegmentPostingsOption::FreqAndPositions => { let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data); self.fill_positions_offset(); remaining @@ -104,11 +98,11 @@ impl FreqHandler { pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) { match self.option { - Option::NoFreq => {} - Option::Freq => { + SegmentPostingsOption::NoFreq => {} + SegmentPostingsOption::Freq => { self.freq_decoder.uncompress_vint_unsorted(data, num_els); } - Option::FreqAndPositions => { + SegmentPostingsOption::FreqAndPositions => { self.freq_decoder.uncompress_vint_unsorted(data, num_els); self.fill_positions_offset(); } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 61811d260..a8b7bf506 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -12,6 +12,7 @@ mod freq_handler; mod union_postings; mod docset; mod scored_docset; +mod segment_postings_option; pub use self::docset::{SkipResult, DocSet}; pub use self::union_postings::UnionPostings; @@ -30,7 +31,7 @@ pub use self::intersection::IntersectionDocSet; pub use self::freq_handler::FreqHandler; pub use self::scored_docset::ScoredDocSet; pub use self::postings::HasLen; - +pub use self::segment_postings_option::SegmentPostingsOption; #[cfg(test)] mod tests { diff --git a/src/postings/segment_postings_option.rs b/src/postings/segment_postings_option.rs new file mode 100644 index 000000000..583e5748b --- /dev/null +++ b/src/postings/segment_postings_option.rs @@ -0,0 +1,5 @@ +pub enum SegmentPostingsOption { + NoFreq, + Freq, + FreqAndPositions, +} \ No newline at end of file diff --git a/src/query/multi_term_query.rs b/src/query/multi_term_query.rs index 8c8693265..ed2378561 100644 --- a/src/query/multi_term_query.rs +++ b/src/query/multi_term_query.rs @@ -20,6 +20,7 @@ use query::MultiTermAccumulator; use DocAddress; use query::Explanation; use query::occur::Occur; +use postings::SegmentPostingsOption; #[derive(Eq, PartialEq, Debug)] @@ -73,7 +74,7 @@ impl MultiTermQuery { let mut decode_timer = timer.open("decode_all"); for &(occur, ref term) in &self.occur_terms { let _decode_one_timer = decode_timer.open("decode_one"); - match reader.read_postings(&term) { + match reader.read_postings(&term, SegmentPostingsOption::Freq) { Some(postings) => { let field = term.get_field(); let fieldnorm_reader = try!(reader.get_fieldnorms_reader(field));