From 4d65771e048f43ebc9f14ec0d28da7925045bf7c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 26 Mar 2018 13:25:29 +0900 Subject: [PATCH] field norm reader is not an option anymore. --- src/core/segment_reader.rs | 48 +++++++++++++----------- src/indexer/merger.rs | 18 ++++----- src/lib.rs | 31 +++++++++++++++- src/postings/mod.rs | 2 +- src/query/intersection.rs | 26 ++++++++----- src/query/phrase_query/phrase_weight.rs | 2 +- src/query/term_query/term_scorer.rs | 19 ++++++++-- src/query/term_query/term_weight.rs | 49 ++++++++++--------------- 8 files changed, 119 insertions(+), 76 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 764b7d927..d6a3fd2d0 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -85,6 +85,7 @@ impl SegmentReader { .unwrap_or(0u32) } + /// Returns true iff some of the documents of the segment have been deleted. pub fn has_deletes(&self) -> bool { self.delete_bitset().is_some() } @@ -105,12 +106,12 @@ impl SegmentReader { ) -> fastfield::Result> { let field_entry = self.schema.get_field_entry(field); if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue) - { - self.fast_fields_composite - .open_read(field) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(FastFieldReader::open) - } else { + { + self.fast_fields_composite + .open_read(field) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(FastFieldReader::open) + } else { Err(FastFieldNotAvailableError::new(field_entry)) } } @@ -123,17 +124,17 @@ impl SegmentReader { ) -> fastfield::Result> { let field_entry = self.schema.get_field_entry(field); if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues) - { - let idx_reader = self.fast_fields_composite - .open_read_with_idx(field, 0) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(FastFieldReader::open)?; - let vals_reader = self.fast_fields_composite - .open_read_with_idx(field, 1) - .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) - .map(FastFieldReader::open)?; - Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader)) - } else { + { + let idx_reader = self.fast_fields_composite + .open_read_with_idx(field, 0) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(FastFieldReader::open)?; + let vals_reader = self.fast_fields_composite + .open_read_with_idx(field, 1) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(FastFieldReader::open)?; + Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader)) + } else { Err(FastFieldNotAvailableError::new(field_entry)) } } @@ -170,10 +171,15 @@ impl SegmentReader { /// /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. - pub fn get_fieldnorms_reader(&self, field: Field) -> Option { - self.fieldnorms_composite - .open_read(field) - .map(FieldNormReader::open) + pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader { + if let Some(fieldnorm_source) = self.fieldnorms_composite + .open_read(field) { + FieldNormReader::open(fieldnorm_source) + } else { + let field_name = self.schema.get_field_name(field); + let err_msg= format!("Field norm not found for field {:?}. Was it market as indexed during indexing.", field_name); + panic!(err_msg); + } } /// Accessor to the segment's `StoreReader`. diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 1d9155b28..00beb1633 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -30,12 +30,11 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { if reader.has_deletes() { // if there are deletes, then we use an approximation // using the fieldnorm - if let Some(fieldnorms_reader) = reader.get_fieldnorms_reader(field) { - for doc in 0..reader.max_doc() { - if !reader.is_deleted(doc) { - let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); - count[fieldnorm_id as usize] += 1; - } + let fieldnorms_reader = reader.get_fieldnorms_reader(field); + for doc in 0..reader.max_doc() { + if !reader.is_deleted(doc) { + let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); + count[fieldnorm_id as usize] += 1; } } } else { @@ -133,13 +132,10 @@ impl IndexMerger { for field in fields { fieldnorms_data.clear(); for reader in &self.readers { - let fieldnorms_reader_opt = reader.get_fieldnorms_reader(field); + let fieldnorms_reader = reader.get_fieldnorms_reader(field); for doc_id in 0..reader.max_doc() { if !reader.is_deleted(doc_id) { - let fieldnorm_id = fieldnorms_reader_opt - .as_ref() - .map(|reader| reader.fieldnorm_id(doc_id)) - .unwrap_or(0u8); + let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc_id); fieldnorms_data.push(fieldnorm_id); } } diff --git a/src/lib.rs b/src/lib.rs index c5d0ebddc..e2469b24c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -399,6 +399,35 @@ mod tests { } } + #[test] + fn test_fieldnorm_no_docs_with_field() { + let mut schema_builder = SchemaBuilder::default(); + let title_field = schema_builder.add_text_field("title", TEXT); + let text_field = schema_builder.add_text_field("text", TEXT); + let index = Index::create_in_ram(schema_builder.build()); + { + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc!(text_field=>"a b c"); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + } + { + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + { + let fieldnorm_reader = reader.get_fieldnorms_reader(text_field); + assert_eq!(fieldnorm_reader.fieldnorm(0), 3); + } + { + let fieldnorm_reader = reader.get_fieldnorms_reader(title_field); + assert_eq!(fieldnorm_reader.fieldnorm_id(0), 0); + } + } + } + #[test] fn test_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); @@ -424,7 +453,7 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader: &SegmentReader = searcher.segment_reader(0); - let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap(); + let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field); assert_eq!(fieldnorms_reader.fieldnorm(0), 3); assert_eq!(fieldnorms_reader.fieldnorm(1), 0); assert_eq!(fieldnorms_reader.fieldnorm(2), 2); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index aa43b9ec0..75c2e4c96 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -208,7 +208,7 @@ pub mod tests { { let segment_reader = SegmentReader::open(&segment).unwrap(); { - let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap(); + let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field) ; assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5); assert_eq!(fieldnorm_reader.fieldnorm(1), 2); for i in 2..1000 { diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 75598b237..35a5ab33e 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -7,12 +7,20 @@ use std::borrow::Borrow; use Score; use query::term_query::{TermScorerNoDeletes, TermScorerWithDeletes}; -pub fn intersect_scorers(mut docsets: Vec>) -> Box { - let num_docsets = docsets.len(); - docsets.sort_by(|left, right| right.size_hint().cmp(&left.size_hint())); - let rarest_opt = docsets.pop(); - let second_rarest_opt = docsets.pop(); - docsets.reverse(); +/// Returns the intersection scorer. +/// +/// The score associated to the documents is the sum of the +/// score of the `Scorer`s given in argument. +/// +/// For better performance, the function uses a +/// specialized implementation if the two +/// shortest scorers are `TermScorer`s. +pub fn intersect_scorers(mut scorers: Vec>) -> Box { + let num_docsets = scorers.len(); + scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint())); + let rarest_opt = scorers.pop(); + let second_rarest_opt = scorers.pop(); + scorers.reverse(); match (rarest_opt, second_rarest_opt) { (None, None) => box EmptyScorer, (Some(single_docset), None) => single_docset, @@ -27,7 +35,7 @@ pub fn intersect_scorers(mut docsets: Vec>) -> Box { return box Intersection { left, right, - others: docsets, + others: scorers, num_docsets } } @@ -43,7 +51,7 @@ pub fn intersect_scorers(mut docsets: Vec>) -> Box { return box Intersection { left, right, - others: docsets, + others: scorers, num_docsets } } @@ -52,7 +60,7 @@ pub fn intersect_scorers(mut docsets: Vec>) -> Box { return box Intersection { left, right, - others: docsets, + others: scorers, num_docsets } } diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 302230127..b6ec4d528 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -31,7 +31,7 @@ impl Weight for PhraseWeight { fn scorer(&self, reader: &SegmentReader) -> Result> { let similarity_weight = self.similarity_weight.clone(); let field = self.phrase_terms[0].field(); - let fieldnorm_reader = reader.get_fieldnorms_reader(field).expect("Failed to find fieldnorm for field"); + let fieldnorm_reader = reader.get_fieldnorms_reader(field); if reader.has_deletes() { let mut term_postings_list = Vec::new(); for term in &self.phrase_terms { diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index a85c1476e..3d9473f70 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -8,9 +8,22 @@ use fieldnorm::FieldNormReader; use query::bm25::BM25Weight; pub struct TermScorer { - pub fieldnorm_reader: FieldNormReader, - pub postings: TPostings, - pub similarity_weight: BM25Weight, + postings: TPostings, + fieldnorm_reader: FieldNormReader, + similarity_weight: BM25Weight, +} + + +impl TermScorer { + pub fn new(postings: TPostings, + fieldnorm_reader: FieldNormReader, + similarity_weight: BM25Weight) -> TermScorer { + TermScorer { + postings, + fieldnorm_reader, + similarity_weight, + } + } } impl DocSet for TermScorer { diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 6d50cba6a..ee8f75d1e 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -22,44 +22,35 @@ impl Weight for TermWeight { fn scorer(&self, reader: &SegmentReader) -> Result> { let field = self.term.field(); let inverted_index = reader.inverted_index(field); - let fieldnorm_reader = reader.get_fieldnorms_reader(field).expect("Failed to find fieldnorm reader for field."); - let scorer: Box; + let fieldnorm_reader = reader.get_fieldnorms_reader(field); + let similarity_weight = self.similarity_weight.clone(); if reader.has_deletes() { let postings_opt: Option> = inverted_index.read_postings(&self.term, self.index_record_option); - scorer = if let Some(segment_postings) = postings_opt { - box TermScorer { - fieldnorm_reader, - postings: segment_postings, - similarity_weight: self.similarity_weight.clone() - } + Ok(box TermScorer::new(segment_postings, + fieldnorm_reader, + similarity_weight)) } else { - box TermScorer { + Ok(box TermScorer::new( + SegmentPostings::::empty(), fieldnorm_reader, - postings: SegmentPostings::::empty(), - similarity_weight: self.similarity_weight.clone() - } - }; + similarity_weight)) + } } else { let postings_opt: Option> = - inverted_index.read_postings_no_deletes(&self.term, self.index_record_option); - scorer = - if let Some(segment_postings) = postings_opt { - box TermScorer { - fieldnorm_reader, - postings: segment_postings, - similarity_weight: self.similarity_weight.clone() - } - } else { - box TermScorer { - fieldnorm_reader, - postings: SegmentPostings::::empty(), - similarity_weight: self.similarity_weight.clone() - } - }; + inverted_index.read_postings_no_deletes(&self.term, self.index_record_option); + if let Some(segment_postings) = postings_opt { + Ok(box TermScorer::new(segment_postings, + fieldnorm_reader, + similarity_weight)) + } else { + Ok(box TermScorer::new( + SegmentPostings::::empty(), + fieldnorm_reader, + similarity_weight)) + } } - Ok(scorer) } fn count(&self, reader: &SegmentReader) -> Result {