From cdcc72a0c90b89d1a3f85225bebf75568a8ce70b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 19 May 2016 11:08:50 +0900 Subject: [PATCH] test passing --- TODO.md | 1 + src/core/index.rs | 4 ++ src/core/merger.rs | 7 +-- src/core/writer.rs | 2 +- src/lib.rs | 7 +-- src/postings/freq_handler.rs | 6 ++- src/postings/segment_postings.rs | 2 +- src/postings/serializer.rs | 38 ++++++++++---- src/schema/mod.rs | 3 +- src/schema/schema.rs | 17 +++--- src/schema/term.rs | 21 ++++++++ src/schema/text_field.rs | 90 +++++++++++++++++++++++++++----- 12 files changed, 152 insertions(+), 46 deletions(-) diff --git a/TODO.md b/TODO.md index 72b741276..41859995f 100644 --- a/TODO.md +++ b/TODO.md @@ -11,3 +11,4 @@ use skip list for each blocks find a clear way to put the tokenized/untokenized thing upstream index frequent bigrams clean up compression +reconsider the first byte == field in the [u8] repr of a term. diff --git a/src/core/index.rs b/src/core/index.rs index d5a88e420..51ab3d9c3 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -258,6 +258,10 @@ impl fmt::Debug for Segment { impl Segment { + pub fn schema(&self,) -> Schema { + self.index.schema() + } + pub fn id(&self,) -> SegmentId { self.segment_id.clone() } diff --git a/src/core/merger.rs b/src/core/merger.rs index a98ad7505..130df2ea8 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -74,7 +74,7 @@ impl<'a> PostingsMerger<'a> { } postings_merger } - + // pushes the term_reader associated with the given segment ordinal // into the heap. fn push_next_segment_el(&mut self, segment_ord: usize) { @@ -96,7 +96,7 @@ impl<'a> PostingsMerger<'a> { let offset = self.doc_offsets[heap_item.segment_ord]; let reader = &self.readers[heap_item.segment_ord]; let segment_postings = reader.read_postings(&heap_item.term_info); - let offset_postings = OffsetPostings::new(segment_postings, offset); + let offset_postings = OffsetPostings::new(segment_postings, offset); segment_postings_list.push(offset_postings); } self.push_next_segment_el(heap_item.segment_ord); @@ -223,11 +223,12 @@ mod tests { use core::searcher::DocAddress; use collector::FastFieldTestCollector; use collector::TestCollector; + use schema::TextIndexingOptions; #[test] fn test_index_merger() { let mut schema = schema::Schema::new(); - let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed().set_stored(); + let text_fieldtype = schema::TextOptions::new().set_indexing_options(TextIndexingOptions::TokenizedWithFreq).set_stored(); let text_field = schema.add_text_field("text", &text_fieldtype); let score_fieldtype = schema::U32Options::new().set_fast(); let score_field = schema.add_u32_field("score", &score_fieldtype); diff --git a/src/core/writer.rs b/src/core/writer.rs index d8eba23d5..8ce0a0034 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -161,7 +161,7 @@ impl SegmentWriter { let doc_id = self.max_doc; for field_value in doc.text_fields() { let field_options = schema.text_field_options(&field_value.field); - if field_options.is_tokenized_indexed() { + if field_options.indexing_options().is_tokenized() { let mut tokens = self.tokenizer.tokenize(&field_value.text); let mut pos = 0u32; loop { diff --git a/src/lib.rs b/src/lib.rs index e363fb8aa..bf1d67119 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,11 +63,9 @@ mod tests { #[test] fn test_indexing() { let mut schema = schema::Schema::new(); - let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed(); - let text_field = schema.add_text_field("text", &text_fieldtype); + let text_field = schema.add_text_field("text", &schema::TEXT); let index = Index::create_from_tempdir(schema).unwrap(); - { // writing the segment let mut index_writer = index.writer_with_num_threads(1).unwrap(); @@ -99,8 +97,7 @@ mod tests { #[test] fn test_searcher() { let mut schema = schema::Schema::new(); - let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed(); - let text_field = schema.add_text_field("text", &text_fieldtype); + let text_field = schema.add_text_field("text", &schema::TEXT); let index = Index::create_in_ram(schema); { diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 70bf54a1c..8cec21154 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -1,5 +1,4 @@ use compression::SIMDBlockDecoder; -use DocId; pub enum FreqHandler { FreqReader(SIMDBlockDecoder), @@ -8,6 +7,11 @@ pub enum FreqHandler { } impl FreqHandler { + + pub fn new_freq_reader() -> FreqHandler { + FreqHandler::FreqReader(SIMDBlockDecoder::new()) + } + pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { match *self { FreqHandler::FreqReader(ref mut block_decoder) => { diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index f8f44332b..3dce489d8 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -50,7 +50,7 @@ impl<'a> SegmentPostings<'a> { doc_freq: doc_freq as usize, doc_offset: 0, block_decoder: SIMDBlockDecoder::new(), - freq_reader: FreqHandler::NoFreq, + freq_reader: FreqHandler::new_freq_reader(), remaining_data: data, cur: Wrapping(usize::max_value()), } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 7542b2efc..d257acb76 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,6 +1,8 @@ use datastruct::FstMapBuilder; use super::TermInfo; use schema::Term; +use schema::Schema; +use schema::TextIndexingOptions; use directory::WritePtr; use compression::{NUM_DOCS_PER_BLOCK, SIMDBlockEncoder, CompositeEncoder}; use DocId; @@ -23,8 +25,8 @@ pub struct PostingsSerializer { doc_ids: Vec, term_freqs: Vec, position_deltas: Vec, - is_termfreq_enabled: bool, - is_positions_enabled: bool, + schema: Schema, + text_indexing_options: TextIndexingOptions, } impl PostingsSerializer { @@ -34,6 +36,7 @@ impl PostingsSerializer { let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS)); + let schema = segment.schema(); Ok(PostingsSerializer { terms_fst_builder: terms_fst_builder, postings_write: postings_write, @@ -46,13 +49,26 @@ impl PostingsSerializer { doc_ids: Vec::new(), term_freqs: Vec::new(), position_deltas: Vec::new(), - is_positions_enabled: false, - is_termfreq_enabled: false, + schema: schema, + text_indexing_options: TextIndexingOptions::Unindexed, }) } + pub fn load_indexing_options(&mut self, term: &Term) { + self.text_indexing_options = match term.get_text_field() { + Some(text_field) => { + let text_options = self.schema.text_field_options(&text_field); + text_options.indexing_options() + } + None => { + TextIndexingOptions::Unindexed + } + }; + } + pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> { try!(self.close_term()); + self.load_indexing_options(term); self.doc_ids.clear(); self.last_doc_id_encoded = 0; self.term_freqs.clear(); @@ -72,7 +88,7 @@ impl PostingsSerializer { self.written_bytes_postings += block_encoded.len(); try!(self.postings_write.write_all(block_encoded)); } - if self.is_termfreq_enabled { + if self.text_indexing_options.is_termfreq_enabled() { { let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]); self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write)); @@ -81,7 +97,7 @@ impl PostingsSerializer { } self.term_freqs.clear(); } - if self.is_positions_enabled { + if self.text_indexing_options.is_position_enabled() { let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]); try!(self.positions_write.write_all(positions_encoded)); self.written_bytes_positions += positions_encoded.len(); @@ -95,13 +111,13 @@ impl PostingsSerializer { pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> { self.doc_ids.push(doc_id); - if self.is_termfreq_enabled { + if self.text_indexing_options.is_termfreq_enabled() { self.term_freqs.push(term_freq as u32); } - if self.is_positions_enabled { + if self.text_indexing_options.is_position_enabled() { self.position_deltas.extend_from_slice(position_deltas); } - if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { + if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { { // encode the positions let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); @@ -109,7 +125,7 @@ impl PostingsSerializer { try!(self.postings_write.write_all(block_encoded)); self.written_bytes_postings += block_encoded.len(); } - if self.is_termfreq_enabled { + if self.text_indexing_options.is_termfreq_enabled() { // encode the term_freqs let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs); try!(self.postings_write.write_all(block_encoded)); @@ -120,7 +136,7 @@ impl PostingsSerializer { } Ok(()) } - + pub fn close(mut self,) -> io::Result<()> { try!(self.close_term()); try!(self.terms_fst_builder.finish()); diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 62a22dc0f..c6d913ae4 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -12,8 +12,9 @@ pub use self::text_field::TextFieldValue; pub use self::text_field::TextOptions; pub use self::text_field::FAST; pub use self::text_field::TEXT; +pub use self::text_field::STRING; pub use self::text_field::STORED; - +pub use self::text_field::TextIndexingOptions; pub use self::u32_field::U32Field; pub use self::u32_field::U32FieldValue; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 1a72c7f03..25293e799 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -33,19 +33,18 @@ pub struct U32FieldEntry { /// # Examples /// /// ``` -/// use tantivy::schema::{Schema, TextOptions}; +/// use tantivy::schema::*; /// /// fn create_schema() -> Schema { /// let mut schema = Schema::new(); /// let str_fieldtype = TextOptions::new(); -/// let text_fieldtype = TextOptions::new().set_tokenized_indexed(); -/// let id_field = schema.add_text_field("id", &str_fieldtype); -/// let url_field = schema.add_text_field("url", &str_fieldtype); -/// let body_field = schema.add_text_field("body", &text_fieldtype); -/// let id_field = schema.add_text_field("id", &str_fieldtype); -/// let url_field = schema.add_text_field("url", &str_fieldtype); -/// let title_field = schema.add_text_field("title", &text_fieldtype); -/// let body_field = schema.add_text_field("body", &text_fieldtype); +/// let id_field = schema.add_text_field("id", &STRING); +/// let url_field = schema.add_text_field("url", &STRING); +/// let body_field = schema.add_text_field("body", &TEXT); +/// let id_field = schema.add_text_field("id", &STRING); +/// let url_field = schema.add_text_field("url", &STRING); +/// let title_field = schema.add_text_field("title", &TEXT); +/// let body_field = schema.add_text_field("body", &TEXT); /// schema /// } /// diff --git a/src/schema/term.rs b/src/schema/term.rs index 5c07978fa..7535e787b 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -24,6 +24,27 @@ impl Term { } } + fn type_num(&self,) -> u8 { + self.data[0] + } + + pub fn is_u32(&self,) -> bool { + !self.is_text() + } + + pub fn is_text(&self,) -> bool { + self.type_num() & 128 == 0 + } + + pub fn get_text_field(&self,) -> Option { + if self.is_text() { + Some(TextField(self.type_num())) + } + else { + None + } + } + pub fn from_field_text(field: &TextField, text: &str) -> Term { let mut buffer = Vec::with_capacity(1 + text.len()); let TextField(field_idx) = *field; diff --git a/src/schema/text_field.rs b/src/schema/text_field.rs index ecd8340a0..e8fd12524 100644 --- a/src/schema/text_field.rs +++ b/src/schema/text_field.rs @@ -10,17 +10,74 @@ use std::ops::BitOr; #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] pub struct TextField(pub u8); +#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash, RustcDecodable, RustcEncodable)] +pub enum TextIndexingOptions { + Unindexed, + Untokenized, + TokenizedNoFreq, + TokenizedWithFreq, + TokenizedWithFreqAndPosition, +} + +impl TextIndexingOptions { + pub fn is_termfreq_enabled(&self) -> bool { + match *self { + TextIndexingOptions::TokenizedWithFreq => true, + TextIndexingOptions::TokenizedWithFreqAndPosition => true, + _ => false, + } + } + + pub fn is_tokenized(&self,) -> bool { + match *self { + TextIndexingOptions::TokenizedNoFreq => true, + TextIndexingOptions::TokenizedWithFreq => true, + TextIndexingOptions::TokenizedWithFreqAndPosition => true, + _ => false, + } + } + + pub fn is_position_enabled(&self,) -> bool { + match *self { + TextIndexingOptions::TokenizedWithFreqAndPosition => true, + _ => false, + } + } +} + + +impl BitOr for TextIndexingOptions { + type Output = TextIndexingOptions; + + fn bitor(self, other: TextIndexingOptions) -> TextIndexingOptions { + use super::TextIndexingOptions::*; + if self == Unindexed { + other + } + else if other == Unindexed { + self + } + else if self == other { + self + } + else { + // make it possible + panic!("Combining {:?} and {:?} is ambiguous"); + } + } +} #[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] pub struct TextOptions { - tokenized_indexed: bool, + indexing_options: TextIndexingOptions, stored: bool, fast: bool, } impl TextOptions { - pub fn is_tokenized_indexed(&self,) -> bool { - self.tokenized_indexed + + pub fn indexing_options(&self,) -> TextIndexingOptions { + self.indexing_options.clone() } pub fn is_stored(&self,) -> bool { @@ -41,15 +98,15 @@ impl TextOptions { self } - pub fn set_tokenized_indexed(mut self,) -> TextOptions { - self.tokenized_indexed = true; + pub fn set_indexing_options(mut self, indexing_options: TextIndexingOptions) -> TextOptions { + self.indexing_options = indexing_options; self } pub fn new() -> TextOptions { TextOptions { fast: false, - tokenized_indexed: false, + indexing_options: TextIndexingOptions::Unindexed, stored: false, } } @@ -94,12 +151,17 @@ pub struct TextFieldValue { } - +/// The field will be untokenized and indexed +pub const STRING: TextOptions = TextOptions { + indexing_options: TextIndexingOptions::Untokenized, + stored: false, + fast: false, +}; /// The field will be tokenized and indexed pub const TEXT: TextOptions = TextOptions { - tokenized_indexed: true, + indexing_options: TextIndexingOptions::TokenizedWithFreqAndPosition, stored: false, fast: false, }; @@ -109,7 +171,7 @@ pub const TEXT: TextOptions = TextOptions { /// Reading the stored fields of a document is relatively slow. /// (100 microsecs) pub const STORED: TextOptions = TextOptions { - tokenized_indexed: false, + indexing_options: TextIndexingOptions::Unindexed, stored: true, fast: false, }; @@ -117,7 +179,7 @@ pub const STORED: TextOptions = TextOptions { /// Fast field are used for field you need to access many times during /// collection. (e.g: for sort, aggregates). pub const FAST: TextOptions = TextOptions { - tokenized_indexed: false, + indexing_options: TextIndexingOptions::Unindexed, stored: false, fast: true }; @@ -129,7 +191,7 @@ impl BitOr for TextOptions { fn bitor(self, other: TextOptions) -> TextOptions { let mut res = TextOptions::new(); - res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed; + res.indexing_options = self.indexing_options | other.indexing_options; res.stored = self.stored || other.stored; res.fast = self.fast || other.fast; res @@ -148,19 +210,19 @@ mod tests { let field_options = STORED | FAST; assert!(field_options.is_stored()); assert!(field_options.is_fast()); - assert!(!field_options.is_tokenized_indexed()); + assert!(!field_options.indexing_options().is_tokenized()); } { let field_options = STORED | TEXT; assert!(field_options.is_stored()); assert!(!field_options.is_fast()); - assert!(field_options.is_tokenized_indexed()); + assert!(field_options.indexing_options().is_tokenized()); } { let mut schema = Schema::new(); let _body_field: TextField = schema.add_text_field("body", &TEXT); let field = schema.text_field("body"); - assert!(schema.text_field_options(&field).is_tokenized_indexed()); + assert!(schema.text_field_options(&field).indexing_options().is_tokenized()); } } }